diff options
author | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-04-16 15:20:36 -0700 |
commit | 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch) | |
tree | 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4/netfilter | |
download | linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2 linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip |
Linux-2.6.12-rc2 (tag: v2.6.12-rc2)
Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.
Let it rip!
Diffstat (limited to 'net/ipv4/netfilter')
76 files changed, 23448 insertions, 0 deletions
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig new file mode 100644 index 000000000000..46d4cb1c06f0 --- /dev/null +++ b/net/ipv4/netfilter/Kconfig @@ -0,0 +1,696 @@ +# +# IP netfilter configuration +# + +menu "IP: Netfilter Configuration" + depends on INET && NETFILTER + +# connection tracking, helpers and protocols +config IP_NF_CONNTRACK + tristate "Connection tracking (required for masq/NAT)" + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is required to do Masquerading or other kinds of Network + Address Translation (except for Fast NAT). It can also be used to + enhance packet filtering (see `Connection state match support' + below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config IP_NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config IP_NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + With this option enabled, the connection tracking code will + be able to do state tracking on SCTP connections. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. 
+ +config IP_NF_FTP + tristate "FTP protocol support" + depends on IP_NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_IRC + tristate "IRC protocol support" + depends on IP_NF_CONNTRACK + ---help--- + There is a commonly-used extension to IRC called + Direct Client-to-Client Protocol (DCC). This enables users to send + files to each other, and also chat to each other without the need + of a server. DCC Sending is used anywhere you send files over IRC, + and DCC Chat is most commonly used by Eggdrop bots. If you are + using NAT, this extension will enable you to send files and initiate + chats. Note that you do NOT need this extension to get files or + have others initiate chats, or everything else in IRC. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_TFTP + tristate "TFTP protocol support" + depends on IP_NF_CONNTRACK + help + TFTP connection tracking helper, this is required depending + on how restrictive your ruleset is. + If you are using a tftp client behind -j SNAT or -j MASQUERADE + you will need this. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_AMANDA + tristate "Amanda backup protocol support" + depends on IP_NF_CONNTRACK + help + If you are running the Amanda backup package <http://www.amanda.org/> + on this machine or machines that will be MASQUERADED through this + machine, then you may want to enable this feature. This allows the + connection tracking and natting code to allow the sub-channels that + Amanda requires for communication of the backup data, messages and + index. + + To compile it as a module, choose M here. If unsure, say Y. 
+ +config IP_NF_QUEUE + tristate "Userspace queueing via NETLINK" + help + Netfilter has the ability to queue packets to user space: the + netlink device can be used to access them using this driver. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_IPTABLES + tristate "IP tables support (required for filtering/masq/NAT)" + help + iptables is a general, extensible packet identification framework. + The packet filtering and full NAT (masquerading, port forwarding, + etc) subsystems now use this: say `Y' or `M' here if you want to use + either of those. + + To compile it as a module, choose M here. If unsure, say N. + +# The matches. +config IP_NF_MATCH_LIMIT + tristate "limit match support" + depends on IP_NF_IPTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_IPRANGE + tristate "IP range match support" + depends on IP_NF_IPTABLES + help + This option makes it possible to match IP addresses against IP address + ranges. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MAC + tristate "MAC address match support" + depends on IP_NF_IPTABLES + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_PKTTYPE + tristate "Packet type match support" + depends on IP_NF_IPTABLES + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP_NF_MATCH_MARK + tristate "netfilter MARK match support" + depends on IP_NF_IPTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MULTIPORT + tristate "Multiple port match support" + depends on IP_NF_IPTABLES + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TOS + tristate "TOS match support" + depends on IP_NF_IPTABLES + help + TOS matching allows you to match packets based on the Type Of + Service fields of the IP packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_RECENT + tristate "recent match support" + depends on IP_NF_IPTABLES + help + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: <http://snowman.net/projects/ipt_recent/> + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ECN + tristate "ECN match support" + depends on IP_NF_IPTABLES + help + This option adds a `ECN' match, which allows you to match against + the IPv4 and TCP header ECN fields. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_DSCP + tristate "DSCP match support" + depends on IP_NF_IPTABLES + help + This option adds a `DSCP' match, which allows you to match against + the IPv4 header DSCP field (DSCP codepoint). + + The DSCP codepoint can have any value between 0x0 and 0x3f. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP_NF_MATCH_AH_ESP + tristate "AH/ESP match support" + depends on IP_NF_IPTABLES + help + These two match extensions (`ah' and `esp') allow you to match a + range of SPIs inside AH or ESP headers of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_LENGTH + tristate "LENGTH match support" + depends on IP_NF_IPTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TTL + tristate "TTL match support" + depends on IP_NF_IPTABLES + help + This adds the CONFIG_IP_NF_MATCH_TTL option, which enables the user + to match packets by their TTL value. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TCPMSS + tristate "tcpmss match support" + depends on IP_NF_IPTABLES + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_HELPER + tristate "Helper match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_MATCH_STATE + tristate "Connection state match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP_NF_MATCH_CONNTRACK + tristate "Connection tracking match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_OWNER + tristate "Owner match support" + depends on IP_NF_IPTABLES + help + Packet owner matching allows you to match locally-generated packets + based on who created them: the user, group, process or session. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_PHYSDEV + tristate "Physdev match support" + depends on IP_NF_IPTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ADDRTYPE + tristate 'address type match support' + depends on IP_NF_IPTABLES + help + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_REALM + tristate 'realm match support' + depends on IP_NF_IPTABLES + select NET_CLS_ROUTE + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. 
+ +config IP_NF_MATCH_SCTP + tristate 'SCTP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_COMMENT + tristate 'comment match support' + depends on IP_NF_IPTABLES + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + +config IP_NF_MATCH_CONNMARK + tristate 'Connection mark match support' + depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + help + This option adds a `connmark' match, which allows you to match the + connection mark value previously set for the session by `CONNMARK'. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_connmark.o. If unsure, say `N'. + +config IP_NF_MATCH_HASHLIMIT + tristate 'hashlimit match support' + depends on IP_NF_IPTABLES + help + This option adds a new iptables `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + ip addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination IP' or `500pps from any given source IP' with a single + IPtables rule. + +# `filter', generic and specific targets +config IP_NF_FILTER + tristate "Packet filtering" + depends on IP_NF_IPTABLES + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP_NF_FILTER + help + The REJECT target allows a filtering rule to specify that an ICMP + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_LOG + tristate "LOG target support" + depends on IP_NF_IPTABLES + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ULOG + tristate "ULOG target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `ULOG' target, which allows you to create rules in + any iptables table. The packet is passed to a userspace logging + daemon using netlink multicast sockets; unlike the LOG target + which can only be viewed through syslog. + + The appropriate userspace logging daemon (ulogd) may be obtained from + <http://www.gnumonks.org/projects/ulogd/> + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TCPMSS + tristate "TCPMSS target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `TCPMSS' target, which allows you to alter the + MSS value of TCP SYN packets, to control the maximum size for that + connection (usually limiting it to your outgoing interface's MTU + minus 40). + + This is used to overcome criminally braindead ISPs or servers which + block ICMP Fragmentation Needed packets. The symptoms of this + problem are that everything works fine from your Linux + firewall/router, but machines behind it can never exchange large + packets: + 1) Web browsers connect, then hang with no data received. + 2) Small mail works fine, but large emails hang. + 3) ssh works fine, but scp hangs after initial handshaking. 
+ + Workaround: activate this option and add a rule to your firewall + configuration like: + + iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \ + -j TCPMSS --clamp-mss-to-pmtu + + To compile it as a module, choose M here. If unsure, say N. + +# NAT + specific targets +config IP_NF_NAT + tristate "Full NAT" + depends on IP_NF_IPTABLES && IP_NF_CONNTRACK + help + The Full NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in iptables: see the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_NEEDED + bool + depends on IP_NF_NAT != n + default y + +config IP_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on IP_NF_NAT + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on IP_NF_NAT + help + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" + depends on IP_NF_NAT + help + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. It is similar to Fast NAT, except that + Netfilter's connection tracking doesn't work well with Fast NAT. 
+ + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_SAME + tristate "SAME target support" + depends on IP_NF_NAT + help + This option adds a `SAME' target, which works like the standard SNAT + target, but attempts to give clients the same IP for all connections. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_NAT + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_IRC + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_IRC=y + default m if IP_NF_IRC=m + +# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), +# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh. +config IP_NF_NAT_FTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_FTP=y + default m if IP_NF_FTP=m + +config IP_NF_NAT_TFTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_TFTP=y + default m if IP_NF_TFTP=m + +config IP_NF_NAT_AMANDA + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_AMANDA=y + default m if IP_NF_AMANDA=m + +# mangle + specific targets +config IP_NF_MANGLE + tristate "Packet mangling" + depends on IP_NF_IPTABLES + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). 
This table is used for various packet alterations + which can affect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TOS + tristate "TOS target support" + depends on IP_NF_MANGLE + help + This option adds a `TOS' target, which allows you to create rules in + the `mangle' table which alter the Type Of Service field of an IP + packet prior to routing. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ECN + tristate "ECN target support" + depends on IP_NF_MANGLE + ---help--- + This option adds a `ECN' target, which can be used in the iptables mangle + table. + + You can use this target to remove the ECN bits from the IPv4 header of + an IP packet. This is particularly useful, if you need to work around + existing ECN blackholes on the internet, but don't want to disable + ECN support in general. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_DSCP + tristate "DSCP target support" + depends on IP_NF_MANGLE + help + This option adds a `DSCP' target, which allows you to alter + the IPv4 header DSCP field (DSCP codepoint). + + The DSCP codepoint can have any value between 0x0 and 0x3f. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_MARK + tristate "MARK target support" + depends on IP_NF_MANGLE + help + This option adds a `MARK' target, which allows you to create rules + in the `mangle' table which alter the netfilter mark (nfmark) field + associated with the packet prior to routing. This can change + the routing method (see `Use netfilter MARK value as routing + key') and can also be used by other subsystems to change their + behavior. + + To compile it as a module, choose M here. If unsure, say N. 
+ +config IP_NF_TARGET_CLASSIFY + tristate "CLASSIFY target support" + depends on IP_NF_MANGLE + help + This option adds a `CLASSIFY' target, which enables the user to set + the priority of a packet. Some qdiscs can use this value for + classification, among these are: + + atm, cbq, dsmark, pfifo_fast, htb, prio + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_CONNMARK + tristate 'CONNMARK target support' + depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE + help + This option adds a `CONNMARK' target, which allows one to manipulate + the connection mark value. Similar to the MARK target, but + affects the connection mark value rather than the packet mark value. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. The module will be called + ipt_CONNMARK.o. If unsure, say `N'. + +config IP_NF_TARGET_CLUSTERIP + tristate "CLUSTERIP target support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL + help + The CLUSTERIP target allows you to build load-balancing clusters of + network servers without having a dedicated load-balancing + router/server/switch. + + To compile it as a module, choose M here. If unsure, say N. + +# raw + specific targets +config IP_NF_RAW + tristate 'raw table support (required for NOTRACK/TRACE)' + depends on IP_NF_IPTABLES + help + This option adds a `raw' table to iptables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. 
+ +config IP_NF_TARGET_NOTRACK + tristate 'NOTRACK target support' + depends on IP_NF_RAW + depends on IP_NF_CONNTRACK + help + The NOTRACK target allows a select rule to specify + which packets *not* to enter the conntrack/NAT + subsystem with all the consequences (no ICMP error tracking, + no protocol helpers for the selected packets). + + If you want to compile it as a module, say M here and read + <file:Documentation/modules.txt>. If unsure, say `N'. + + +# ARP tables +config IP_NF_ARPTABLES + tristate "ARP tables support" + help + arptables is a general, extensible packet identification framework. + The ARP packet filtering and mangling (manipulation) subsystems + use this: say Y or M here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_ARPFILTER + tristate "ARP packet filtering" + depends on IP_NF_ARPTABLES + help + ARP packet filtering defines a table `filter', which has a series of + rules for simple ARP packet filtering at local input and + local output. On a bridge, you can also specify filtering rules + for forwarded ARP packets. See the man page for arptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_ARP_MANGLE + tristate "ARP payload mangling" + depends on IP_NF_ARPTABLES + help + Allows altering the ARP packet payload: source and destination + hardware and network addresses. + +endmenu + diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile new file mode 100644 index 000000000000..45796d5924dd --- /dev/null +++ b/net/ipv4/netfilter/Makefile @@ -0,0 +1,89 @@ +# +# Makefile for the netfilter modules on top of IPv4. 
+# + +# objects for the standalone - connection tracking / NAT +ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o +iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o + +# connection tracking +obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o + +# SCTP protocol connection tracking +obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o + +# connection tracking helpers +obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o +obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o +obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o +obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o + +# NAT helpers +obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o +obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o +obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o +obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o + +# generic IP tables +obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o + +# the three instances of ip_tables +obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o +obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o +obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o +obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o + +# matches +obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o +obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o +obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o +obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o +obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o +obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o +obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o +obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o +obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o +obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o +obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o +obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o +obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o +obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o 
+obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o +obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o +obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o +obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o +obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o +obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o +obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o +obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o +obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o +obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o + +# targets +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o +obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o +obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o +obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o +obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o +obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o + +# generic ARP tables +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o + +# just filtering instance of ARP tables for now +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c new file mode 100644 index 000000000000..df79f5ed6a0a --- /dev/null +++ b/net/ipv4/netfilter/arp_tables.c @@ -0,0 +1,1333 @@ +/* + * Packet matching code for ARP packets. 
+ * + * Based heavily, if not almost entirely, upon ip_tables.c framework. + * + * Some ARP specific bits are: + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/config.h> +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/netdevice.h> +#include <linux/if_arp.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <asm/uaccess.h> +#include <asm/semaphore.h> + +#include <linux/netfilter_arp/arp_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); +MODULE_DESCRIPTION("arptables core"); + +/*#define DEBUG_ARP_TABLES*/ +/*#define DEBUG_ARP_TABLES_USER*/ + +#ifdef DEBUG_ARP_TABLES +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_ARP_TABLES_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define ARP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("ARP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define ARP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(arpt_mutex); + +#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/listhelp.h> + +struct arpt_table_info { + unsigned int size; + unsigned int number; + unsigned int initial_entries; + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + unsigned int underflow[NF_ARP_NUMHOOKS]; + char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); +}; + +static LIST_HEAD(arpt_target); +static LIST_HEAD(arpt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, + char *hdr_addr, int len) +{ + int i, ret; + + if (len > ARPT_DEV_ADDR_LEN_MAX) + len = ARPT_DEV_ADDR_LEN_MAX; + + ret = 0; + for (i = 0; i < len; i++) + ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; + + return (ret != 0); +} + +/* Returns whether packet matches rule or not. 
*/ +static inline int arp_packet_match(const struct arphdr *arphdr, + struct net_device *dev, + const char *indev, + const char *outdev, + const struct arpt_arp *arpinfo) +{ + char *arpptr = (char *)(arphdr + 1); + char *src_devaddr, *tgt_devaddr; + u32 src_ipaddr, tgt_ipaddr; + int i, ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) + + if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, + ARPT_INV_ARPOP)) { + dprintf("ARP operation field mismatch.\n"); + dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", + arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + return 0; + } + + if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, + ARPT_INV_ARPHRD)) { + dprintf("ARP hardware address format mismatch.\n"); + dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", + arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + return 0; + } + + if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, + ARPT_INV_ARPPRO)) { + dprintf("ARP protocol address format mismatch.\n"); + dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", + arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + return 0; + } + + if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, + ARPT_INV_ARPHLN)) { + dprintf("ARP hardware address length mismatch.\n"); + dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", + arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + return 0; + } + + src_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&src_ipaddr, arpptr, sizeof(u32)); + arpptr += sizeof(u32); + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); + + if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), + ARPT_INV_SRCDEVADDR) || + FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), + ARPT_INV_TGTDEVADDR)) { + dprintf("Source or target device address 
mismatch.\n"); + + return 0; + } + + if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, + ARPT_INV_SRCIP) || + FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr), + ARPT_INV_TGTIP)) { + dprintf("Source or target IP address mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(src_ipaddr), + NIPQUAD(arpinfo->smsk.s_addr), + NIPQUAD(arpinfo->src.s_addr), + arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); + dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(tgt_ipaddr), + NIPQUAD(arpinfo->tmsk.s_addr), + NIPQUAD(arpinfo->tgt.s_addr), + arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches. */ + for (i = 0, ret = 0; i < IFNAMSIZ; i++) { + ret |= (indev[i] ^ arpinfo->iniface[i]) + & arpinfo->iniface_mask[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, arpinfo->iniface, + arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + unsigned long odev; + memcpy(&odev, outdev + i*sizeof(unsigned long), + sizeof(unsigned long)); + ret |= (odev + ^ ((const unsigned long *)arpinfo->outiface)[i]) + & ((const unsigned long *)arpinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, arpinfo->outiface, + arpinfo->invflags&ARPT_INV_VIA_OUT ?" 
(INV)":""); + return 0; + } + + return 1; +} + +static inline int arp_checkentry(const struct arpt_arp *arp) +{ + if (arp->flags & ~ARPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + arp->flags & ~ARPT_F_MASK); + return 0; + } + if (arp->invflags & ~ARPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + arp->invflags & ~ARPT_INV_MASK); + return 0; + } + + return 1; +} + +static unsigned int arpt_error(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("arp_tables: error: '%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +{ + return (struct arpt_entry *)(base + offset); +} + +unsigned int arpt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct arpt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ]; + unsigned int verdict = NF_DROP; + struct arphdr *arp; + int hotdrop = 0; + struct arpt_entry *e, *back; + const char *indev, *outdev; + void *table_base; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) + + (2 * (*pskb)->dev->addr_len) + + (2 * sizeof(u32))))) + return NF_DROP; + + indev = in ? in->name : nulldevname; + outdev = out ? 
out->name : nulldevname; + + read_lock_bh(&table->lock); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, + smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + back = get_entry(table_base, table->private->underflow[hook]); + + arp = (*pskb)->nh.arph; + do { + if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { + struct arpt_entry_target *t; + int hdr_len; + + hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + + (2 * (*pskb)->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); + + t = arpt_get_target(e); + + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct arpt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != ARPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct arpt_entry *next + = (void *)e + e->next_offset; + next->comefrom = + (void *)back - table_base; + + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + * abs. verdicts + */ + verdict = t->u.kernel.target->target(pskb, + hook, + in, out, + t->data, + userdata); + + /* Target might have changed stuff. 
*/ + arp = (*pskb)->nh.arph; + + if (verdict == ARPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + read_unlock_bh(&table->lock); + + if (hotdrop) + return NF_DROP; + else + return verdict; +} + +static inline void *find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + duprintf("find_inlist: loading `%s%s'.\n", prefix, name); + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex); +} + +static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int unconditional(const struct arpt_arp *arp) +{ + unsigned int i; + + for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) + if (((__u32 *)arp)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + * there are loops. Puts hook bitmask in comefrom. 
+ */ +static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + * to 0 as we leave), and comefrom to save source hook bitmask. + */ + for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct arpt_entry *e + = (struct arpt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct arpt_standard_target *t + = (void *)arpt_get_target(e); + + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { + printk("arptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct arpt_entry) + && (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->arp)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + * big jump. + */ + do { + e->comefrom ^= (1<<NF_ARP_NUMHOOKS); + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct arpt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct arpt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... 
this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct arpt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int standard_check(const struct arpt_entry_target *t, + unsigned int max_offset) +{ + struct arpt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { + duprintf("arpt_standard_check: target size %u != %Zu\n", + t->u.target_size, + ARPT_ALIGN(sizeof(struct arpt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct arpt_entry)) { + duprintf("arpt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("arpt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static struct arpt_target arpt_standard_target; + +static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + if (!arp_checkentry(&e->arp)) { + duprintf("arp_tables: arp check failed %p %s.\n", e, name); + return -EINVAL; + } + + t = arpt_get_target(e); + target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto out; + } + if (!try_module_get((target->me))) { + ret = -ENOENT; + goto out_unlock; + } + t->u.kernel.target = target; + up(&arpt_mutex); + + if (t->u.kernel.target == &arpt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto out; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("arp_tables: check 
failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto out; + } + + (*i)++; + return 0; + +out_unlock: + up(&arpt_mutex); +out: + return ret; +} + +static inline int check_entry_size_and_hooks(struct arpt_entry *e, + struct arpt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not ARPT_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct arpt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = arpt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + * newinfo). 
+ */ +static int translate_table(const char *name, + unsigned int valid_hooks, + struct arpt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + + /* Walk through entries, checking offsets. */ + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) { + duprintf("Looping hook\n"); + return -ELOOP; + } + + /* Finally, each sanity check must pass */ + i = 0; + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + 
+static struct arpt_table_info *replace_table(struct arpt_table *table, + unsigned int num_counters, + struct arpt_table_info *newinfo, + int *error) +{ + struct arpt_table_info *oldinfo; + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? */ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int add_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void get_counters(const struct arpt_table_info *t, + struct arpt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct arpt_entry *e; + struct arpt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + * (other than comefrom, which userspace doesn't care + * about). + */ + countersize = sizeof(struct arpt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... 
*/ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + struct arpt_entry_target *t; + + e = (struct arpt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct arpt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + t = arpt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct arpt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int get_entries(const struct arpt_get_entries *entries, + struct arpt_get_entries __user *uptr) +{ + int ret; + struct arpt_table *t; + + t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&arpt_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int do_replace(void __user *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct arpt_table *t; + struct arpt_table_info *newinfo, *oldinfo; + struct arpt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > 
num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("arp_tables: Translated table\n"); + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + /* Get a reference in advance, we're not allowed fail later */ + if (!try_module_get(t->me)) { + ret = -EBUSY; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. 
*/ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct arpt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&arpt_mutex); + return ret; + + put_module: + module_put(t->me); + free_newinfo_counters_untrans_unlock: + up(&arpt_mutex); + free_newinfo_counters_untrans: + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. + */ +static inline int add_counter_to_entry(struct arpt_entry *e, + const struct arpt_counters addme[], + unsigned int *i) +{ + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct arpt_counters_info tmp, *paddc; + struct arpt_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + ARPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&arpt_mutex); + free: + vfree(paddc); + + return ret; +} + +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if 
(!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: { + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length %u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + t = arpt_find_table_lock(name, &ret, &arpt_mutex); + if (t) { + struct arpt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&arpt_mutex); + } + } + break; + + case ARPT_SO_GET_ENTRIES: { + struct arpt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. 
*/ +int arpt_register_target(struct arpt_target *target) +{ + int ret; + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&arpt_target, target)) { + duprintf("arpt_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&arpt_mutex); + return ret; +} + +void arpt_unregister_target(struct arpt_target *target) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_target, target); + up(&arpt_mutex); +} + +int arpt_register_table(struct arpt_table *table, + const struct arpt_replace *repl) +{ + int ret; + struct arpt_table_info *newinfo; + static struct arpt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) { + ret = -ENOMEM; + return ret; + } + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + duprintf("arpt_register_table: translate table gives %d\n", ret); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&arpt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&arpt_tables, table); + + unlock: + up(&arpt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void arpt_unregister_table(struct arpt_table *table) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_tables, table); + up(&arpt_mutex); + + /* Decrease module usage counts and free resources */ + ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct arpt_target arpt_standard_target = { + .name = ARPT_STANDARD_TARGET, +}; + +static struct arpt_target arpt_error_target = { + .name = ARPT_ERROR_TARGET, + .target = arpt_error, +}; + +static struct nf_sockopt_ops arpt_sockopts = { + .pf = PF_INET, + .set_optmin = ARPT_BASE_CTL, + .set_optmax = ARPT_SO_SET_MAX+1, + .set = do_arpt_set_ctl, + .get_optmin = ARPT_BASE_CTL, + .get_optmax = ARPT_SO_GET_MAX+1, + .get = do_arpt_get_ctl, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct arpt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int arpt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&arpt_mutex) != 0) + return 0; + + LIST_FIND(&arpt_tables, print_name, struct arpt_table *, + offset, buffer, length, &pos, &count); + + up(&arpt_mutex); + + /* `start' hack - see 
fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&arpt_mutex); + list_append(&arpt_target, &arpt_standard_target); + list_append(&arpt_target, &arpt_error_target); + up(&arpt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&arpt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + + proc = proc_net_create("arp_tables_names", 0, arpt_get_tables); + if (!proc) { + nf_unregister_sockopt(&arpt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } +#endif + + printk("arp_tables: (C) 2002 David S. Miller\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&arpt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("arp_tables_names"); +#endif +} + +EXPORT_SYMBOL(arpt_register_table); +EXPORT_SYMBOL(arpt_unregister_table); +EXPORT_SYMBOL(arpt_do_table); +EXPORT_SYMBOL(arpt_register_target); +EXPORT_SYMBOL(arpt_unregister_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c new file mode 100644 index 000000000000..3e592ec86482 --- /dev/null +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -0,0 +1,104 @@ +/* module that allows mangling of the arp payload */ +#include <linux/module.h> +#include <linux/netfilter_arp/arpt_mangle.h> +#include <net/sock.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>"); +MODULE_DESCRIPTION("arptables arp payload mangle target"); + +static unsigned int +target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, + const struct net_device *out, const void *targinfo, void *userinfo) +{ + const struct arpt_mangle *mangle = targinfo; + struct arphdr *arp; + unsigned char *arpptr; + 
int pln, hln; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + arp = (*pskb)->nh.arph; + arpptr = (*pskb)->nh.raw + sizeof(*arp); + pln = arp->ar_pln; + hln = arp->ar_hln; + /* We assume that pln and hln were checked in the match */ + if (mangle->flags & ARPT_MANGLE_SDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->src_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_SIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_s.src_ip, pln); + } + arpptr += pln; + if (mangle->flags & ARPT_MANGLE_TDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->tgt_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_TIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_t.tgt_ip, pln); + } + return mangle->target; +} + +static int +checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + const struct arpt_mangle *mangle = targinfo; + + if (mangle->flags & ~ARPT_MANGLE_MASK || + !(mangle->flags & ARPT_MANGLE_MASK)) + return 0; + + if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && + mangle->target != ARPT_CONTINUE) + return 0; + return 1; +} + +static struct arpt_target arpt_mangle_reg += { + .name = "mangle", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (arpt_register_target(&arpt_mangle_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + arpt_unregister_target(&arpt_mangle_reg); +} + 
+module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c new file mode 100644 index 000000000000..0d759f5a4ef0 --- /dev/null +++ b/net/ipv4/netfilter/arptable_filter.c @@ -0,0 +1,214 @@ +/* + * Filtering ARP tables module. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include <linux/module.h> +#include <linux/netfilter_arp/arp_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); +MODULE_DESCRIPTION("arptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ + (1 << NF_ARP_FORWARD)) + +/* Standard entry. */ +struct arpt_standard +{ + struct arpt_entry entry; + struct arpt_standard_target target; +}; + +struct arpt_error_target +{ + struct arpt_entry_target target; + char errorname[ARPT_FUNCTION_MAXNAMELEN]; +}; + +struct arpt_error +{ + struct arpt_entry entry; + struct arpt_error_target target; +}; + +static struct +{ + struct arpt_replace repl; + struct arpt_standard entries[3]; + struct arpt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error), + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + 0, NULL, { } }, + { + /* ARP_IN */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_OUT */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 
0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_FORWARD */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + } + }, + /* ERROR */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_error), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct arpt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .private = NULL, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c */ +static unsigned int arpt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops arpt_ops[] = { + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_IN, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_FORWARD, + }, +}; + +static int __init init(void) +{ + int ret, i; + + /* Register table */ + ret = arpt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + if ((ret = nf_register_hook(&arpt_ops[i])) < 0) + 
goto cleanup_hooks; + return ret; + +cleanup_hooks: + while (--i >= 0) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c new file mode 100644 index 000000000000..3dbddd062605 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -0,0 +1,167 @@ +/* Amanda extension for IP connection tracking, Version 0.2 + * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca> + * based on HW's ip_conntrack_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Module load syntax: + * insmod ip_conntrack_amanda.o [master_timeout=n] + * + * Where master_timeout is the timeout (in seconds) of the master + * connection (port 10080). This defaults to 5 minutes but if + * your clients take longer than 5 minutes to do their work + * before getting back to the Amanda server, you can increase + * this value. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/moduleparam.h> +#include <net/checksum.h> +#include <net/udp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_amanda.h> + +static unsigned int master_timeout = 300; + +MODULE_AUTHOR("Brian J. 
Murrell <netfilter@interlinx.bc.ca>"); +MODULE_DESCRIPTION("Amanda connection tracking module"); +MODULE_LICENSE("GPL"); +module_param(master_timeout, int, 0600); +MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); + +static char *conns[] = { "DATA ", "MESG ", "INDEX " }; + +/* This is slow, but it's simple. --RR */ +static char amanda_buffer[65536]; +static DECLARE_LOCK(amanda_buffer_lock); + +unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + struct ip_conntrack_expect *exp; + char *data, *data_limit, *tmp; + unsigned int dataoff, i; + u_int16_t port, len; + int ret = NF_ACCEPT; + + /* Only look at packets from the Amanda server */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* increase the UDP timeout of the master connection as replies from + * Amanda clients to the server can be quite delayed */ + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); + + /* No data? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + if (net_ratelimit()) + printk("amanda_help: skblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + LOCK_BH(&amanda_buffer_lock); + skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); + data = amanda_buffer; + data_limit = amanda_buffer + (*pskb)->len - dataoff; + *data_limit = '\0'; + + /* Search for the CONNECT string */ + data = strstr(data, "CONNECT "); + if (!data) + goto out; + data += strlen("CONNECT "); + + /* Only search first line. 
*/ + if ((tmp = strchr(data, '\n'))) + *tmp = '\0'; + + for (i = 0; i < ARRAY_SIZE(conns); i++) { + char *match = strstr(data, conns[i]); + if (!match) + continue; + tmp = data = match + strlen(conns[i]); + port = simple_strtoul(data, &data, 10); + len = data - tmp; + if (port == 0 || len > 5) + break; + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + + exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->tuple.dst.u.tcp.port = htons(port); + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.protonum = 0xFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + + if (ip_nat_amanda_hook) + ret = ip_nat_amanda_hook(pskb, ctinfo, + tmp - amanda_buffer, + len, exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + } + +out: + UNLOCK_BH(&amanda_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper amanda_helper = { + .max_expected = ARRAY_SIZE(conns), + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "amanda", + + .tuple = { .src = { .u = { __constant_htons(10080) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&amanda_helper); +} + +static int __init init(void) +{ + return ip_conntrack_helper_register(&amanda_helper); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c new file mode 100644 index 000000000000..28d9425d5c39 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -0,0 +1,1247 @@ +/* Connection state tracking for netfilter. 
This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte <laforge@gnumonks.org> + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/icmp.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/vmalloc.h> +#include <net/checksum.h> +#include <net/ip.h> +#include <linux/stddef.h> +#include <linux/sysctl.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/err.h> +#include <linux/percpu.h> +#include <linux/moduleparam.h> + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#define IP_CONNTRACK_VERSION "2.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +DECLARE_RWLOCK(ip_conntrack_lock); + +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); + +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; +LIST_HEAD(ip_conntrack_expect_list); +struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; +static LIST_HEAD(helpers); +unsigned int ip_conntrack_htable_size = 0; +int ip_conntrack_max; +struct list_head *ip_conntrack_hash; +static kmem_cache_t *ip_conntrack_cachep; +static kmem_cache_t *ip_conntrack_expect_cachep; +struct ip_conntrack ip_conntrack_untracked; +unsigned int ip_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int ip_conntrack_vmalloc; + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} + +static int ip_conntrack_hash_rnd_initted; +static unsigned int ip_conntrack_hash_rnd; + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ +#if 0 + dump_tuple(tuple); +#endif + return (jhash_3words(tuple->src.ip, + (tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + ip_conntrack_hash_rnd) % ip_conntrack_htable_size); +} + +int +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) +{ + /* Never happen */ + if (iph->frag_off & htons(IP_OFFSET)) { + printk("ip_conntrack_core: Frag of proto %u.\n", + iph->protocol); + return 0; + } + + tuple->src.ip = iph->saddr; + tuple->dst.ip = iph->daddr; + tuple->dst.protonum = iph->protocol; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) +{ + inverse->src.ip = orig->dst.ip; + inverse->dst.ip = 
orig->src.ip; + inverse->dst.protonum = orig->dst.protonum; + inverse->dst.dir = !orig->dst.dir; + + return protocol->invert_tuple(inverse, orig); +} + + +/* ip_conntrack_expect helper functions */ +static void destroy_expect(struct ip_conntrack_expect *exp) +{ + ip_conntrack_put(exp->master); + IP_NF_ASSERT(!timer_pending(&exp->timeout)); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + CONNTRACK_STAT_INC(expect_delete); +} + +static void unlink_expect(struct ip_conntrack_expect *exp) +{ + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + list_del(&exp->list); + /* Logically in destroy_expect, but we hold the lock here. */ + exp->master->expecting--; +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct ip_conntrack_expect *exp = (void *)ul_expect; + + WRITE_LOCK(&ip_conntrack_lock); + unlink_expect(exp); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct ip_conntrack_expect * +find_expectation(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). */ + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && is_confirmed(i->master) + && del_timer(&i->timeout)) { + unlink_expect(i); + return i; + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct ip_conntrack *ct) +{ + struct ip_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. 
*/ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + } +} + +static void +clean_from_lists(struct ip_conntrack *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; + struct ip_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + IP_NF_ASSERT(atomic_read(&nfct->use) == 0); + IP_NF_ASSERT(!timer_pending(&ct->timeout)); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to ip_conntrack_lock!!! -HW */ + proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (ip_conntrack_destroyed) + ip_conntrack_destroyed(ct); + + WRITE_LOCK(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. 
*/ + if (!is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + CONNTRACK_STAT_INC(delete); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (ct->master) + ip_conntrack_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + kmem_cache_free(ip_conntrack_cachep, ct); + atomic_dec(&ip_conntrack_count); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct ip_conntrack *ct = (void *)ul_conntrack; + + WRITE_LOCK(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. */ + CONNTRACK_STAT_INC(delete_list); + clean_from_lists(ct); + WRITE_UNLOCK(&ip_conntrack_lock); + ip_conntrack_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + return tuplehash_to_ctrack(i) != ignored_conntrack + && ip_ct_tuple_equal(tuple, &i->tuple); +} + +static struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; + } + CONNTRACK_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. 
*/ +struct ip_conntrack_tuple_hash * +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + READ_UNLOCK(&ip_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__ip_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* ipt_REJECT uses ip_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + IP_NF_ASSERT(!is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + WRITE_LOCK(&ip_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. 
*/ + if (!LIST_FIND(&ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. */ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + CONNTRACK_STAT_INC(insert); + WRITE_UNLOCK(&ip_conntrack_lock); + return NF_ACCEPT; + } + + CONNTRACK_STAT_INC(insert_failed); + WRITE_UNLOCK(&ip_conntrack_lock); + + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + READ_UNLOCK(&ip_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. 
*/ +static inline int unreplied(const struct ip_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct = NULL; + int dropped = 0; + + READ_LOCK(&ip_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); + if (h) { + ct = tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + READ_UNLOCK(&ip_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + CONNTRACK_STAT_INC(early_drop); + } + ip_conntrack_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct ip_conntrack_helper *i, + const struct ip_conntrack_tuple *rtuple) +{ + return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct ip_conntrack_helper *, + tuple); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct ip_conntrack_tuple_hash * +init_conntrack(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + size_t hash; + struct ip_conntrack_expect *exp; + + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + + hash = hash_conntrack(tuple); + + if (ip_conntrack_max + && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + /* Try dropping from this hash chain. 
*/ + if (!early_drop(&ip_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "ip_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return ERR_PTR(-ENOMEM); + } + + memset(conntrack, 0, sizeof(*conntrack)); + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; + if (!protocol->new(conntrack, skb)) { + kmem_cache_free(ip_conntrack_cachep, conntrack); + return NULL; + } + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + WRITE_LOCK(&ip_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#if CONFIG_IP_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + CONNTRACK_STAT_INC(expect_new); + } else { + conntrack->helper = ip_ct_find_helper(&repl_tuple); + + CONNTRACK_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. 
*/ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + atomic_inc(&ip_conntrack_count); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + destroy_expect(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct ip_conntrack * +resolve_normal_ct(struct sk_buff *skb, + struct ip_conntrack_protocol *proto, + int *set_reply, + unsigned int hooknum, + enum ip_conntrack_info *ctinfo) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; + + IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); + + if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, + &tuple,proto)) + return NULL; + + /* look for tuple match */ + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, proto, skb); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: normal packet for %p\n", + ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: related packet for %p\n", + ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("ip_conntrack_in: new packet for %p\n", + ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +/* Netfilter hook itself. 
*/ +unsigned int ip_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_conntrack_protocol *proto; + int set_reply; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + CONNTRACK_STAT_INC(ignore); + return NF_ACCEPT; + } + + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return NF_DROP; + } + + /* FIXME: Do this right please. --RR */ + (*pskb)->nfcache |= NFC_UNKNOWN; + +/* Doesn't cover locally-generated broadcast, so not worth it. */ +#if 0 + /* Ignore broadcast: no `connection'. */ + if ((*pskb)->pkt_type == PACKET_BROADCAST) { + printk("Broadcast packet!\n"); + return NF_ACCEPT; + } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) + == htonl(0x000000FF)) { + printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + NIPQUAD((*pskb)->nh.iph->daddr), + (*pskb)->sk, (*pskb)->pkt_type); + } +#endif + + proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL + && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { + CONNTRACK_STAT_INC(error); + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { + /* Not valid part of a connection */ + CONNTRACK_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. 
*/ + CONNTRACK_STAT_INC(drop); + return NF_DROP; + } + + IP_NF_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, ctinfo); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do*/ + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (set_reply) + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + + return ret; +} + +int invert_tuplepr(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig) +{ + return ip_ct_invert_tuple(inverse, orig, + ip_ct_find_proto(orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct ip_conntrack_tuple intersect_mask + = { { a->mask.src.ip & b->mask.src.ip, + { a->mask.src.u.all & b->mask.src.u.all } }, + { a->mask.dst.ip & b->mask.dst.ip, + { a->mask.dst.u.all & b->mask.dst.u.all }, + a->mask.dst.protonum & b->mask.dst.protonum } }; + + return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + return a->master == b->master + && ip_ct_tuple_equal(&a->tuple, &b->tuple) + && ip_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. 
*/ +void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) +{ + struct ip_conntrack_expect *i; + + WRITE_LOCK(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + unlink_expect(i); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(i); + return; + } + } + WRITE_UNLOCK(&ip_conntrack_lock); +} + +struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) +{ + struct ip_conntrack_expect *new; + + new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = NULL; + return new; +} + +void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) +{ + kmem_cache_free(ip_conntrack_expect_cachep, expect); +} + +static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) +{ + atomic_inc(&exp->master->ct_general.use); + exp->master->expecting++; + list_add(&exp->list, &ip_conntrack_expect_list); + + if (exp->master->helper->timeout) { + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires + = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + } else + exp->timeout.function = NULL; + + CONNTRACK_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. 
*/ +static void evict_oldest_expect(struct ip_conntrack *master) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + break; + } + } +} + +static inline int refresh_timer(struct ip_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) +{ + struct ip_conntrack_expect *i; + int ret; + + DEBUGP("ip_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + WRITE_LOCK(&ip_conntrack_lock); + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + /* We don't need the one they've given us. */ + ip_conntrack_expect_free(expect); + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + ip_conntrack_expect_insert(expect); + ret = 0; +out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is + implicitly racy: see __ip_conntrack_confirm */ +void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) +{ + WRITE_LOCK(&ip_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + IP_NF_ASSERT(!is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = ip_ct_find_helper(newreply); + WRITE_UNLOCK(&ip_conntrack_lock); +} + +int ip_conntrack_helper_register(struct ip_conntrack_helper *me) +{ + BUG_ON(me->timeout == 0); + WRITE_LOCK(&ip_conntrack_lock); + list_prepend(&helpers, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_helper *me) +{ + if (tuplehash_to_ctrack(i)->helper == me) + tuplehash_to_ctrack(i)->helper = NULL; + return 0; +} + +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) +{ + unsigned int i; + struct ip_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + WRITE_LOCK(&ip_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + unlink_expect(exp); + destroy_expect(exp); + } + } + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); + for (i = 0; i < ip_conntrack_htable_size; i++) + LIST_FIND_W(&ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. 
*/ + synchronize_net(); +} + +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) +{ + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* If not in hash table, timer will not be active yet */ + if (!is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + ct_add_counters(ct, ctinfo, skb); + } else { + WRITE_LOCK(&ip_conntrack_lock); + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + } + ct_add_counters(ct, ctinfo, skb); + WRITE_UNLOCK(&ip_conntrack_lock); + } +} + +/* Returns new sk_buff, or NULL */ +struct sk_buff * +ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + struct sock *sk = skb->sk; +#ifdef CONFIG_NETFILTER_DEBUG + unsigned int olddebug = skb->nf_debug; +#endif + + if (sk) { + sock_hold(sk); + skb_orphan(skb); + } + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (!skb) { + if (sk) + sock_put(sk); + return skb; + } + + if (sk) { + skb_set_owner_w(skb, sk); + sock_put(sk); + } + + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; +#ifdef CONFIG_NETFILTER_DEBUG + /* Packet path as if nothing had happened. */ + skb->nf_debug = olddebug; +#endif + return skb; +} + +/* Used by ipt_REJECT. 
*/ +static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = ip_conntrack_get(skb, &ctinfo); + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct ip_conntrack_tuple_hash *i, + int (*iter)(struct ip_conntrack *i, void *data), + void *data) +{ + return iter(tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct ip_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), + void *data, unsigned int *bucket) +{ + struct ip_conntrack_tuple_hash *h = NULL; + + WRITE_LOCK(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + WRITE_UNLOCK(&ip_conntrack_lock); + + return h; +} + +void +ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + ip_conntrack_put(ct); + } +} + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. 
*/ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + + IP_CT_TUPLE_U_BLANK(&tuple); + tuple.src.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? */ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = ip_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + ip_conntrack_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +static int kill_all(struct ip_conntrack *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(void) +{ + if (ip_conntrack_vmalloc) + vfree(ip_conntrack_hash); + else + free_pages((unsigned long)ip_conntrack_hash, + get_order(sizeof(struct list_head) + * ip_conntrack_htable_size)); +} + +/* Mishearing the voices in his head, our 
hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + i_see_dead_people: + ip_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&ip_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + kmem_cache_destroy(ip_conntrack_cachep); + kmem_cache_destroy(ip_conntrack_expect_cachep); + free_conntrack_hash(); + nf_unregister_sockopt(&so_getorigdst); +} + +static int hashsize; +module_param(hashsize, int, 0400); + +int __init ip_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (hashsize) { + ip_conntrack_htable_size = hashsize; + } else { + ip_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + ip_conntrack_htable_size = 8192; + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } + ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack version %s (%u buckets, %d max)" + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, + ip_conntrack_htable_size, ip_conntrack_max, + sizeof(struct ip_conntrack)); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + /* AK: the hash table is twice as big than needed because it + uses list_head. it would be much nicer to caches to use a + single pointer list head here. 
*/ + ip_conntrack_vmalloc = 0; + ip_conntrack_hash + =(void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + *ip_conntrack_htable_size)); + if (!ip_conntrack_hash) { + ip_conntrack_vmalloc = 1; + printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); + ip_conntrack_hash = vmalloc(sizeof(struct list_head) + * ip_conntrack_htable_size); + } + if (!ip_conntrack_hash) { + printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); + goto err_unreg_sockopt; + } + + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, + 0, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); + goto err_free_hash; + } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), + 0, 0, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + WRITE_LOCK(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) + ip_ct_protos[i] = &ip_conntrack_generic_protocol; + /* Sew in builtin protocols. 
*/ + ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; + ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; + ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + WRITE_UNLOCK(&ip_conntrack_lock); + + for (i = 0; i < ip_conntrack_htable_size; i++) + INIT_LIST_HEAD(&ip_conntrack_hash[i]); + + /* For use by ipt_REJECT */ + ip_ct_attach = ip_conntrack_attach; + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&ip_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); +err_free_hash: + free_conntrack_hash(); +err_unreg_sockopt: + nf_unregister_sockopt(&so_getorigdst); + + return -ENOMEM; +} diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c new file mode 100644 index 000000000000..12b88cbb11db --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -0,0 +1,501 @@ +/* FTP extension for IP connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/ctype.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_ftp.h> +#include <linux/moduleparam.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. 
--RR */ +static char ftp_buffer[65536]; + +static DECLARE_LOCK(ip_ftp_lock); + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, int, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int try_rfc959(const char *, size_t, u_int32_t [], char); +static int try_eprt(const char *, size_t, u_int32_t [], char); +static int try_epsv_response(const char *, size_t, u_int32_t [], char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, u_int32_t[], char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. 
*/ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + return try_number(data, dlen, array, 6, ',', term); +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int32_t array[2]) +{ + u_int16_t port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (port == 0) + break; + array[0] = port >> 8; + array[1] = port; + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + port = port*10 + data[i] - '0'; + else /* Some other crap */ + break; + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */ +static int try_eprt(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4, then + delimiter again. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != '1' || data[2] != delim) + return 0; + + DEBUGP("EPRT: Got |1|!\n"); + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length == 0) + return 0; + + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, array+4); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + + /* Three delimiters. 
*/ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, array+4); +} + +/* Return 1 for match, 0 for accept, -1 for partial. */ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + u_int32_t array[6], + int (*getnum)(const char *, size_t, u_int32_t[], char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, array, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u16 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u16 nl_seq, struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. 
*/ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + else if (oldest != NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][oldest] = nl_seq; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq, array[6] = { 0 }; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info; + struct ip_conntrack_expect *exp; + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: pskblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + LOCK_BH(&ip_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. */ + DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl[0][dir] + old_seq_aft_nl_set ? 
"":"(UNSET) ", old_seq_aft_nl); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP array to expected address (it's not mentioned + in EPSV responses) */ + array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF; + array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF; + array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; + array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, (*pskb)->len - dataoff, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + array, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n", + fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); + + /* Allocate expectation which will be inserted */ + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + + if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) + != ct->tuplehash[dir].tuple.src.ip) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. 
*/ + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + array[0], array[1], array[2], array[3], + NIPQUAD(ct->tuplehash[dir].tuple.src.ip)); + + /* Thanks to Cristiano Lincoln Mattos + <lincoln@cesar.org.br> for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + ip_conntrack_expect_free(exp); + goto out_update_nl; + } + exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]); + exp->tuple.src.u.tcp.port = 0; /* Don't care. */ + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask = ((struct ip_conntrack_tuple) + { { 0xFFFFFFFF, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + + exp->expectfn = NULL; + exp->master = ct; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (ip_nat_ftp_hook) + ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } else + ret = NF_ACCEPT; + } + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. 
*/ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info,dir); + out: + UNLOCK_BH(&ip_ftp_lock); + return ret; +} + +static struct ip_conntrack_helper ftp[MAX_PORTS]; +static char ftp_names[MAX_PORTS][10]; + +/* Not __exit: called from init() */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&ftp[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + for (i = 0; i < ports_c; i++) { + ftp[i].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i].tuple.dst.protonum = IPPROTO_TCP; + ftp[i].mask.src.u.tcp.port = 0xFFFF; + ftp[i].mask.dst.protonum = 0xFF; + ftp[i].max_expected = 1; + ftp[i].timeout = 5 * 60; /* 5 minutes */ + ftp[i].me = THIS_MODULE; + ftp[i].help = help; + + tmpname = &ftp_names[i][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i].name = tmpname; + + DEBUGP("ip_ct_ftp: registering helper for port %d\n", + ports[i]); + ret = ip_conntrack_helper_register(&ftp[i]); + + if (ret) { + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c new file mode 100644 index 000000000000..33cc7348b6ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -0,0 +1,313 @@ +/* IRC extension for IP connection tracking, Version 1.21 + * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org> + * based on RR's ip_conntrack_ftp.c + * + * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ ** + * Module load syntax: + * insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS> + * max_dcc_channels=n dcc_timeout=secs + * + * please give the ports of all IRC servers You wish to connect to. + * If You don't specify ports, the default will be port 6667. + * With max_dcc_channels you can define the maximum number of not + * yet answered DCC channels per IRC session (default 8). + * With dcc_timeout you can specify how long the system waits for + * an expected DCC channel (default 300 seconds). + * + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/tcp.h> + +#include <linux/netfilter_ipv4/lockhelp.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_irc.h> +#include <linux/moduleparam.h> + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +static int max_dcc_channels = 8; +static unsigned int dcc_timeout = 300; +/* This is slow, but it's simple. --RR */ +static char irc_buffer[65536]; +static DECLARE_LOCK(irc_buffer_lock); + +unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_irc_hook); + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); +MODULE_LICENSE("GPL"); +module_param_array(ports, int, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of IRC servers"); +module_param(max_dcc_channels, int, 0400); +MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); +module_param(dcc_timeout, int, 0400); +MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); + +static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +#define MINMATCHLEN 5 + +#if 0 +#define DEBUGP(format, args...) 
printk(KERN_DEBUG "%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +static int parse_dcc(char *data, char *data_end, u_int32_t *ip, + u_int16_t *port, char **ad_beg_p, char **ad_end_p) +/* tries to get the ip_addr and port out of a dcc command + return value: -1 on failure, 0 on success + data pointer to first byte of DCC command data + data_end pointer to last byte of dcc command data + ip returns parsed ip of dcc command + port returns parsed port of dcc command + ad_beg_p returns pointer to first byte of addr data + ad_end_p returns pointer to last byte of addr data */ +{ + + /* at least 12: "AAAAAAAA P\1\n" */ + while (*data++ != ' ') + if (data > data_end - 12) + return -1; + + *ad_beg_p = data; + *ip = simple_strtoul(data, &data, 10); + + /* skip blanks between ip and port */ + while (*data == ' ') { + if (data >= data_end) + return -1; + data++; + } + + *port = simple_strtoul(data, &data, 10); + *ad_end_p = data; + + return 0; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff; + struct tcphdr _tcph, *th; + char *data, *data_limit, *ib_ptr; + int dir = CTINFO2DIR(ctinfo); + struct ip_conntrack_expect *exp; + u32 seq; + u_int32_t dcc_ip; + u_int16_t dcc_port; + int i, ret = NF_ACCEPT; + char *addr_beg_p, *addr_end_p; + + DEBUGP("entered\n"); + + /* If packet is coming from IRC server */ + if (dir == IP_CT_DIR_REPLY) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + DEBUGP("Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + /* Not a full tcp header? */ + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? 
*/ + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + if (dataoff >= (*pskb)->len) + return NF_ACCEPT; + + LOCK_BH(&irc_buffer_lock); + ib_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, irc_buffer); + BUG_ON(ib_ptr == NULL); + + data = ib_ptr; + data_limit = ib_ptr + (*pskb)->len - dataoff; + + /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 + * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ + while (data < (data_limit - (19 + MINMATCHLEN))) { + if (memcmp(data, "\1DCC ", 5)) { + data++; + continue; + } + + data += 5; + /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + + DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest)); + + for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { + if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { + /* no match */ + continue; + } + + DEBUGP("DCC %s detected\n", dccprotos[i]); + data += strlen(dccprotos[i]); + /* we have at least + * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc((char *)data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { + /* unable to parse */ + DEBUGP("unable to parse dcc command\n"); + continue; + } + DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", + HIPQUAD(dcc_ip), dcc_port); + + /* dcc_ip can be the internal OR external (NAT'ed) IP + * Tiago Sousa <mirage@kaotik.org> */ + if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip) + && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) { + if (net_ratelimit()) + printk(KERN_WARNING + "Forged DCC command from " + "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", + NIPQUAD(ct->tuplehash[dir].tuple.src.ip), + HIPQUAD(dcc_ip), dcc_port); + + continue; + } + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* save position of address in dcc string, + * necessary for NAT */ + DEBUGP("tcph->seq = %u\n", th->seq); + seq = ntohl(th->seq) + 
(addr_beg_p - ib_ptr); + + /* We refer to the reverse direction ("!dir") + * tuples here, because we're expecting + * something in the other * direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { ct->tuplehash[!dir].tuple.dst.ip, + { .tcp = { htons(dcc_port) } }, + IPPROTO_TCP }}); + exp->mask = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + exp->expectfn = NULL; + exp->master = ct; + if (ip_nat_irc_hook) + ret = ip_nat_irc_hook(pskb, ctinfo, + addr_beg_p - ib_ptr, + addr_end_p - addr_beg_p, + exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + goto out; + } /* for .. NUM_DCCPROTO */ + } /* while data < ... */ + + out: + UNLOCK_BH(&irc_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper irc_helpers[MAX_PORTS]; +static char irc_names[MAX_PORTS][10]; + +static void fini(void); + +static int __init init(void) +{ + int i, ret; + struct ip_conntrack_helper *hlpr; + char *tmpname; + + if (max_dcc_channels < 1) { + printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); + return -EBUSY; + } + if (dcc_timeout < 0) { + printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); + return -EBUSY; + } + + /* If no port given, default to standard irc port */ + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; + + for (i = 0; i < ports_c; i++) { + hlpr = &irc_helpers[i]; + hlpr->tuple.src.u.tcp.port = htons(ports[i]); + hlpr->tuple.dst.protonum = IPPROTO_TCP; + hlpr->mask.src.u.tcp.port = 0xFFFF; + hlpr->mask.dst.protonum = 0xFF; + hlpr->max_expected = max_dcc_channels; + hlpr->timeout = dcc_timeout; + hlpr->me = THIS_MODULE; + hlpr->help = help; + + tmpname = &irc_names[i][0]; + if (ports[i] == IRC_PORT) + sprintf(tmpname, "irc"); + else + sprintf(tmpname, "irc-%d", i); + hlpr->name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = 
ip_conntrack_helper_register(hlpr); + + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", + ports[i]); + fini(); + return -EBUSY; + } + } + return 0; +} + +/* This function is intentionally _NOT_ defined as __exit, because + * it is needed by the init function */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&irc_helpers[i]); + } +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c new file mode 100644 index 000000000000..88c3712bd251 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -0,0 +1,75 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. 
*/ +static int generic_print_conntrack(struct seq_file *s, + const struct ip_conntrack *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_generic_protocol = +{ + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c new file mode 100644 index 000000000000..602c74db3252 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -0,0 +1,279 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/icmp.h> +#include <linux/seq_file.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. 
*/ +static int icmp_packet(struct ip_conntrack *ct, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct ip_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct ip_conntrack_protocol *innerproto; + struct ip_conntrack_tuple_hash *h; + int dataoff; + + IP_NF_ASSERT(skb->nfct == NULL); + + /* Not enough header? 
*/ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_track: fragment of proto %u\n", + inside->ip.protocol); + return NF_ACCEPT; + } + + innerproto = ip_ct_find_proto(inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; + /* Are they talking about one of our connections? */ + if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { + DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + return NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. */ + if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + DEBUGP("icmp_error_track: Can't invert tuple\n"); + return NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = ip_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = ip_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_track: no match\n"); + return NF_ACCEPT; + } + /* Reverse direction from that found */ + if (DIRECTION(h) != IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? 
*/ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? 
*/ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct ip_conntrack_protocol ip_conntrack_protocol_icmp = +{ + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c new file mode 100644 index 000000000000..ff8c34a860ff --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -0,0 +1,649 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/sctp.h> +#include <linux/string.h> +#include <linux/seq_file.h> + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DECLARE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. 
--RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long ip_ct_sctp_timeout_closed = 10 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long ip_ct_sctp_timeout_established = 5 DAYS; +static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. 
But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply direction. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. 
+*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. 
*/ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. */ +static int sctp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + READ_LOCK(&sctp_lock); + state = conntrack->proto.sctp.state; + READ_UNLOCK(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \ +for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first 
OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. 
*/
static int sctp_packet(struct ip_conntrack *conntrack,
		       const struct sk_buff *skb,
		       enum ip_conntrack_info ctinfo)
{
	enum sctp_conntrack newconntrack, oldsctpstate;
	int dir = CTINFO2DIR(ctinfo);
	struct iphdr *iph = skb->nh.iph;
	sctp_sctphdr_t _sctph, *sh;
	sctp_chunkhdr_t _sch, *sch;
	u_int32_t offset, count;
	char map[256 / sizeof (char)] = {0};

	DEBUGP(__FUNCTION__);
	DEBUGP("\n");

	sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
	if (sh == NULL)
		return -1;

	/* Also records in map which chunk types the packet carries */
	if (do_basic_checks(conntrack, skb, map) != 0)
		return -1;

	/* Check the verification tag (Sec 8.5) */
	if (!test_bit(SCTP_CID_INIT, (void *)map)
	    && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
	    && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
	    && !test_bit(SCTP_CID_ABORT, (void *)map)
	    && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
	    && (sh->vtag != conntrack->proto.sctp.vtag[dir])) {
		DEBUGP("Verification tag check failed\n");
		return -1;
	}

	oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
	for_each_sctp_chunk (skb, sch, _sch, offset, count) {
		WRITE_LOCK(&sctp_lock);

		/* Special cases of Verification tag check (Sec 8.5.1) */
		if (sch->type == SCTP_CID_INIT) {
			/* Sec 8.5.1 (A) */
			if (sh->vtag != 0)
				goto out_unlock;
		} else if (sch->type == SCTP_CID_ABORT) {
			/* Sec 8.5.1 (B) */
			if (!(sh->vtag == conntrack->proto.sctp.vtag[dir])
			    && !(sh->vtag == conntrack->proto.sctp.vtag[!dir]))
				goto out_unlock;
		} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
			/* Sec 8.5.1 (C) */
			if (!(sh->vtag == conntrack->proto.sctp.vtag[dir])
			    && !(sh->vtag == conntrack->proto.sctp.vtag[!dir]
				 && (sch->flags & 1)))
				goto out_unlock;
		} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
			/* Sec 8.5.1 (D) */
			if (!(sh->vtag == conntrack->proto.sctp.vtag[dir]))
				goto out_unlock;
		}

		oldsctpstate = conntrack->proto.sctp.state;
		newconntrack = new_state(dir, oldsctpstate, sch->type);

		/* Invalid */
		if (newconntrack == SCTP_CONNTRACK_MAX) {
			DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
			       dir, sch->type, oldsctpstate);
			goto out_unlock;
		}

		/* If it is an INIT or an INIT ACK note down the vtag */
		if (sch->type == SCTP_CID_INIT
		    || sch->type == SCTP_CID_INIT_ACK) {
			sctp_inithdr_t _inithdr, *ih;

			ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
						sizeof(_inithdr), &_inithdr);
			if (ih == NULL)
				goto out_unlock;

			/* The tag announced here must be used by packets
			 * travelling in the other direction */
			DEBUGP("Setting vtag %x for dir %d\n",
			       ih->init_tag, !dir);
			conntrack->proto.sctp.vtag[!dir] = ih->init_tag;
		}

		conntrack->proto.sctp.state = newconntrack;
		WRITE_UNLOCK(&sctp_lock);
	}

	/* Without a processed chunk newconntrack is still SCTP_CONNTRACK_MAX,
	 * which is past the end of sctp_timeouts[]; and the slot for
	 * SCTP_CONNTRACK_NONE is NULL.  Treat both as invalid instead of
	 * dereferencing a bad pointer below. */
	if (newconntrack == SCTP_CONNTRACK_MAX
	    || sctp_timeouts[newconntrack] == NULL) {
		DEBUGP("ip_conntrack_sctp: no usable state for packet\n");
		return -1;
	}

	ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);

	if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
	    && dir == IP_CT_DIR_REPLY
	    && newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
		DEBUGP("Setting assured bit\n");
		set_bit(IPS_ASSURED_BIT, &conntrack->status);
	}

	return NF_ACCEPT;

out_unlock:
	WRITE_UNLOCK(&sctp_lock);
	return -1;
}

/* Called when a new connection for this protocol found. 
*/ +static int sctp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum sctp_conntrack newconntrack; + struct iphdr *iph = skb->nh.iph; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state (IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { + .proto = 
IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &ip_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &ip_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &ip_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_sctp_timeout_established", + .data = &ip_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &ip_ct_sctp_timeout_shutdown_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &ip_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = 
"ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &ip_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ip_ct_sysctl_header; +#endif + +static int __init init(void) +{ + int ret; + + ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); + if (ret) { + printk("ip_conntrack_proto_sctp: protocol register failed\n"); + goto out; + } + +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + ret = -ENOMEM; + printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#endif + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? 
"failed": "succeeded"); + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c new file mode 100644 index 000000000000..e800b16fc920 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -0,0 +1,1098 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>: + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * version 2.2 + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/spinlock.h> + +#include <net/tcp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) 
+#endif + +/* Protects conntrack->proto.tcp */ +static DECLARE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int ip_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int ip_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int ip_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long ip_ct_tcp_timeout_established = 5 DAYS; +unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. 
*/ +unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. 
+ * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... 
+ * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? 
+ * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. 
*/ +static int tcp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum tcp_conntrack state; + + READ_LOCK(&tcp_lock); + state = conntrack->proto.tcp.state; + READ_UNLOCK(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + struct iphdr *iph, + struct tcphdr *tcph) +{ + return (seq + len - (iph->ihl + tcph->doff)*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 
1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) 
+ | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode=*ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = ntohl(*((u_int32_t *)(ptr+i)+1)); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + __u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. 
+ */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, iph, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, iph, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." 
+ */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. + */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. 
+ */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? 
"BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = ip_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +void ip_conntrack_tcp_update(struct sk_buff *skb, + struct ip_conntrack *conntrack, + enum ip_conntrack_dir dir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); + + WRITE_LOCK(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... 
+ */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + WRITE_UNLOCK(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - iph->ihl * 4; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? 
Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + */ + /* FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + WRITE_LOCK(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. 
We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); + + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid packet ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. 
*/ + WRITE_UNLOCK(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } else { + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid SYN"); + return -NF_ACCEPT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, iph, th)) { + WRITE_UNLOCK(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans + ? 
ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + WRITE_UNLOCK(&tcp_lock); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int tcp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum tcp_conntrack new_state; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("ip_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + 
conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? 
*/ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_tcp = +{ + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c new file mode 100644 index 000000000000..5bc28a224623 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -0,0 +1,146 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/netfilter.h> +#include <linux/in.h> +#include <linux/udp.h> +#include <linux/seq_file.h> +#include <net/checksum.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> + +unsigned long ip_ct_udp_timeout = 30*HZ; +unsigned long ip_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. 
*/ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } else + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int udplen = skb->len - iph->ihl * 4; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? 
skb->csum + : skb_checksum(skb, iph->ihl*4, udplen, 0))) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_udp = +{ + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c new file mode 100644 index 000000000000..80a7bde2a57a --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -0,0 +1,961 @@ +/* This file contains all the functions required for the standalone + ip_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/percpu.h> +#ifdef CONFIG_SYSCTL +#include <linux/sysctl.h> +#endif +#include <net/checksum.h> +#include <net/ip.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_protocol.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t ip_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +static int kill_proto(struct ip_conntrack *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + *((u_int8_t *) data)); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *proto) +{ + seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); + return proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < 
ip_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&ip_conntrack_hash[st->bucket])) + return ip_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &ip_conntrack_hash[st->bucket]) { + if (++st->bucket >= ip_conntrack_htable_size) + return NULL; + head = ip_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + READ_LOCK(&ip_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct ip_conntrack_tuple_hash *hash = v; + const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); + struct ip_conntrack_protocol *proto; + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + IP_NF_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (DIRECTION(hash)) + return 0; + + proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + IP_NF_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %ld ", + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? 
(long)(conntrack->timeout.expires - jiffies)/HZ + : 0) != 0) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%lu ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + 
READ_LOCK(&ip_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &ip_conntrack_expect_list) + return NULL; + } + return e; +} + +/* Step from entry v to the following entry of ip_conntrack_expect_list; + returns NULL once the list head is reached again. */ +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + /* Keep *pos in step with the entry we hand back: exp_seq_start() + walks *pos + 1 links from the list head, so without this a + restarted read would begin at the same entry again. */ + ++*pos; + e = e->next; + + if (e == &ip_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct ip_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? (long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + + seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); + + print_tuple(s, &expect->tuple, + ip_ct_find_proto(expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int 
ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static unsigned int ip_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. 
*/ + ct = ip_conntrack_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + return ip_conntrack_confirm(pskb); +} + +/* Hook function: when NAT is not built, accepts packets whose nfct is + already set; otherwise passes fragmented packets to + ip_ct_gather_frags() and returns NF_STOLEN if no skb comes back. */ +static unsigned int ip_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. */ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +/* Hook function: runs ip_confirm() (dropping the packet unless it + returns NF_ACCEPT), then re-fragments packets longer than the route + MTU with ip_fragment() unless tso_size is set. */ +static unsigned int ip_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We defragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ip_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. 
*/ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return ip_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ip_conntrack_defrag_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_in_ops = { + .hook = ip_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_local_out_ops = { + .hook = ip_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* Refragmenter; last chance. 
*/ +static struct nf_hook_ops ip_conntrack_out_ops = { + .hook = ip_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_LAST, +}; + +static struct nf_hook_ops ip_conntrack_local_in_ops = { + .hook = ip_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_LAST-1, +}; + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From ip_conntrack_core.c */ +extern int ip_conntrack_max; +extern unsigned int ip_conntrack_htable_size; + +/* From ip_conntrack_proto_tcp.c */ +extern unsigned long ip_ct_tcp_timeout_syn_sent; +extern unsigned long ip_ct_tcp_timeout_syn_recv; +extern unsigned long ip_ct_tcp_timeout_established; +extern unsigned long ip_ct_tcp_timeout_fin_wait; +extern unsigned long ip_ct_tcp_timeout_close_wait; +extern unsigned long ip_ct_tcp_timeout_last_ack; +extern unsigned long ip_ct_tcp_timeout_time_wait; +extern unsigned long ip_ct_tcp_timeout_close; +extern unsigned long ip_ct_tcp_timeout_max_retrans; +extern int ip_ct_tcp_loose; +extern int ip_ct_tcp_be_liberal; +extern int ip_ct_tcp_max_retrans; + +/* From ip_conntrack_proto_udp.c */ +extern unsigned long ip_ct_udp_timeout; +extern unsigned long ip_ct_udp_timeout_stream; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_icmp_timeout; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *ip_ct_sysctl_header; + +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, + .procname = "ip_conntrack_count", + .data = &ip_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + 
.proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, + .procname = "ip_conntrack_buckets", + .data = &ip_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "ip_conntrack_tcp_timeout_syn_sent", + .data = &ip_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "ip_conntrack_tcp_timeout_syn_recv", + .data = &ip_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_tcp_timeout_established", + .data = &ip_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "ip_conntrack_tcp_timeout_fin_wait", + .data = &ip_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "ip_conntrack_tcp_timeout_close_wait", + .data = &ip_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "ip_conntrack_tcp_timeout_last_ack", + .data = &ip_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "ip_conntrack_tcp_timeout_time_wait", + .data = &ip_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = 
&proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "ip_conntrack_tcp_timeout_close", + .data = &ip_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "ip_conntrack_udp_timeout", + .data = &ip_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "ip_conntrack_udp_timeout_stream", + .data = &ip_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "ip_conntrack_icmp_timeout", + .data = &ip_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "ip_conntrack_generic_timeout", + .data = &ip_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, + .procname = "ip_conntrack_log_invalid", + .data = &ip_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &ip_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, + .procname = "ip_conntrack_tcp_loose", + .data = &ip_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 
NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "ip_conntrack_tcp_be_liberal", + .data = &ip_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "ip_conntrack_tcp_max_retrans", + .data = &ip_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +#define NET_IP_CONNTRACK_MAX 2089 + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { + .ctl_name = NET_IP_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +EXPORT_SYMBOL(ip_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = ip_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + ret = -ENOMEM; + proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif + + ret = 
nf_register_hook(&ip_conntrack_defrag_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing defrag hook.\n"); + goto cleanup_proc_stat; + } + ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + ret = nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + ret = nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + ret = nf_register_hook(&ip_conntrack_local_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + printk("ip_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ip_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("ip_conntrack_expect"); + cleanup_proc: + proc_net_remove("ip_conntrack"); + cleanup_init: +#endif 
/* CONFIG_PROC_FS */ + ip_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. --RR */ +/* Install proto in ip_ct_protos[proto->proto] under ip_conntrack_lock. + Returns 0, or -EBUSY when that slot does not hold the generic + protocol. */ +int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) +{ + int ret = 0; + + WRITE_LOCK(&ip_conntrack_lock); + if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + ret = -EBUSY; + goto out; + } + ip_ct_protos[proto->proto] = proto; + out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +/* Point proto's slot in ip_ct_protos back at the generic protocol, + call synchronize_net(), then remove the matching conntrack entries + through ip_ct_iterate_cleanup(). */ +void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) +{ + WRITE_LOCK(&ip_conntrack_lock); + ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all conntrack entries for this protocol */ + ip_ct_iterate_cleanup(kill_proto, &proto->proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. 
*/ +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(ip_conntrack_protocol_unregister); +EXPORT_SYMBOL(ip_ct_get_tuple); +EXPORT_SYMBOL(invert_tuplepr); +EXPORT_SYMBOL(ip_conntrack_alter_reply); +EXPORT_SYMBOL(ip_conntrack_destroyed); +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(ip_conntrack_helper_register); +EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(ip_ct_iterate_cleanup); +EXPORT_SYMBOL(ip_ct_refresh_acct); +EXPORT_SYMBOL(ip_ct_protos); +EXPORT_SYMBOL(ip_ct_find_proto); +EXPORT_SYMBOL(ip_conntrack_expect_alloc); +EXPORT_SYMBOL(ip_conntrack_expect_free); +EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL(ip_conntrack_tuple_taken); +EXPORT_SYMBOL(ip_ct_gather_frags); +EXPORT_SYMBOL(ip_conntrack_htable_size); +EXPORT_SYMBOL(ip_conntrack_lock); +EXPORT_SYMBOL(ip_conntrack_hash); +EXPORT_SYMBOL(ip_conntrack_untracked); +EXPORT_SYMBOL_GPL(ip_conntrack_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_put); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(ip_conntrack_tcp_update); +#endif diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c new file mode 100644 index 000000000000..992fac3e36ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -0,0 +1,159 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ *
+ * Version: 0.0.7
+ *
+ * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
+ * 	- port to newnat API
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
+#include <linux/moduleparam.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("tftp connection tracking helper");
+MODULE_LICENSE("GPL");
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, int, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of tftp servers");
+
+#if 0
+#define DEBUGP(format, args...) printk("%s:%s:" format, \
+                                       __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Optional NAT hook; when set, tftp_help() hands the new expectation to it
+ * instead of registering the expectation itself. */
+unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
+				 enum ip_conntrack_info ctinfo,
+				 struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
+
+/*
+ * Conntrack helper callback for TFTP.
+ *
+ * Reads the TFTP header that follows the IP and UDP headers.  For a read
+ * or write request it allocates an expectation copied from the
+ * connection's reply tuple (source IP, destination IP, destination UDP
+ * port and protocol are all matched exactly) and either passes it to
+ * ip_nat_tftp_hook or registers it directly.  Other opcodes are ignored.
+ *
+ * Returns a netfilter verdict: NF_DROP when the expectation cannot be
+ * allocated or registered, the NAT hook's verdict when the hook is used,
+ * NF_ACCEPT otherwise.
+ */
+static int tftp_help(struct sk_buff **pskb,
+		     struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo)
+{
+	struct tftphdr _tftph, *tfh;
+	struct ip_conntrack_expect *exp;
+	unsigned int ret = NF_ACCEPT;
+
+	tfh = skb_header_pointer(*pskb,
+				 (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
+				 sizeof(_tftph), &_tftph);
+	if (tfh == NULL)
+		return NF_ACCEPT;
+
+	switch (ntohs(tfh->opcode)) {
+	/* RRQ and WRQ works the same way */
+	case TFTP_OPCODE_READ:
+	case TFTP_OPCODE_WRITE:
+		DEBUGP("");
+		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+		exp = ip_conntrack_expect_alloc();
+		if (exp == NULL)
+			return NF_DROP;
+
+		exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		exp->mask.src.ip = 0xffffffff;
+		exp->mask.dst.ip = 0xffffffff;
+		exp->mask.dst.u.udp.port = 0xffff;
+		exp->mask.dst.protonum = 0xff;
+		exp->expectfn = NULL;
+		exp->master = ct;
+
+		DEBUGP("expect: ");
+		DUMP_TUPLE(&exp->tuple);
+		DUMP_TUPLE(&exp->mask);
+		if (ip_nat_tftp_hook)
+			ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
+		else if (ip_conntrack_expect_related(exp) != 0) {
+			ip_conntrack_expect_free(exp);
+			ret = NF_DROP;
+		}
+		break;
+	case TFTP_OPCODE_DATA:
+	case TFTP_OPCODE_ACK:
+		DEBUGP("Data/ACK opcode\n");
+		break;
+	case TFTP_OPCODE_ERROR:
+		DEBUGP("Error opcode\n");
+		break;
+	default:
+		DEBUGP("Unknown opcode\n");
+	}
+	/* Propagate the verdict computed above; returning NF_ACCEPT here
+	 * would silently discard a drop from the NAT hook or from a failed
+	 * expectation registration. */
+	return ret;
+}
+
+static struct ip_conntrack_helper tftp[MAX_PORTS];
+static char tftp_names[MAX_PORTS][10];
+
+/* Unregister the helper of every configured port. */
+static void fini(void)
+{
+	int i;
+
+	for (i = 0 ; i < ports_c; i++) {
+		DEBUGP("unregistering helper for port %d\n",
+			ports[i]);
+		ip_conntrack_helper_unregister(&tftp[i]);
+	} 
+}
+
+/*
+ * Register one helper per configured port (TFTP_PORT alone when no port
+ * was given).  Returns 0, or the error from ip_conntrack_helper_register()
+ * after unregistering the helpers that had already been registered.
+ */
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = TFTP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Create helper structure */
+		memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
+
+		tftp[i].tuple.dst.protonum = IPPROTO_UDP;
+		tftp[i].tuple.src.u.udp.port = htons(ports[i]);
+		tftp[i].mask.dst.protonum = 0xFF;
+		tftp[i].mask.src.u.udp.port = 0xFFFF;
+		tftp[i].max_expected = 1;
+		tftp[i].timeout = 5 * 60; /* 5 minutes */
+		tftp[i].me = THIS_MODULE;
+		tftp[i].help = tftp_help;
+
+		tmpname = &tftp_names[i][0];
+		if (ports[i] == TFTP_PORT)
+			sprintf(tmpname, "tftp");
+		else
+			sprintf(tmpname, "tftp-%d", i);
+		tftp[i].name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret=ip_conntrack_helper_register(&tftp[i]);
+		if (ret) {
+			printk("ERROR registering helper for port %d\n",
+				ports[i]);
+			/* Undo only entries [0, i): fini() walks all
+			 * ports_c slots and would also unregister helpers
+			 * that were never registered. */
+			while (--i >= 0)
+				ip_conntrack_helper_unregister(&tftp[i]);
+			return ret;
+		}
+	}
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
new file mode 100644
index 000000000000..da1f412583ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -0,0 +1,88 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. 
Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ *	Module load syntax:
+ * 	insmod ip_nat_amanda.o
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
+
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+
+/*
+ * NAT side of the Amanda helper, installed as ip_nat_amanda_hook.
+ *
+ * Registers @exp on the first destination port, counting upwards from the
+ * port the expectation arrived with, that ip_conntrack_expect_related()
+ * accepts, then rewrites the matched text in the UDP payload
+ * (@matchoff/@matchlen) to that port in decimal.
+ *
+ * Returns NF_DROP (after freeing @exp) when the port counter wraps to 0
+ * without success; otherwise returns whatever ip_nat_mangle_udp_packet()
+ * returned, withdrawing the expectation when that is not NF_ACCEPT.
+ */
+static unsigned int help(struct sk_buff **pskb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct ip_conntrack_expect *exp)
+{
+	char portstr[sizeof("65535")];
+	u_int16_t candidate;
+	unsigned int verdict;
+
+	/* Connection comes from client. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_ORIGINAL;
+
+	/* The expected packet must be NATed the same way as this one
+	 * (ie. same IP: it will be TCP and master is UDP). */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Prefer the original port; otherwise walk upwards until one is
+	 * accepted or the 16-bit counter wraps round to 0. */
+	candidate = ntohs(exp->saved_proto.tcp.port);
+	while (candidate != 0) {
+		exp->tuple.dst.u.tcp.port = htons(candidate);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+		candidate++;
+	}
+
+	if (candidate == 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+
+	sprintf(portstr, "%u", candidate);
+	verdict = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
+					   matchoff, matchlen,
+					   portstr, strlen(portstr));
+	if (verdict != NF_ACCEPT)
+		ip_conntrack_unexpect_related(exp);
+
+	return verdict;
+}
+
+/* Module unload: clear the hook, then wait for users already inside it. */
+static void __exit fini(void)
+{
+	ip_nat_amanda_hook = NULL;
+	/* Make sure no one calls it, meanwhile. */
+	synchronize_net();
+}
+
+/* Module load: install help() as the Amanda NAT hook (must be unset). */
+static int __init init(void)
+{
+	BUG_ON(ip_nat_amanda_hook);
+	ip_nat_amanda_hook = help;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 000000000000..162ceacfc29a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,556 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation. 
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>  /* For tcp_prot in getorigdst */
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/jhash.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_nat_lock);
+
+/* Calculated at init based on memory size */
+static unsigned int ip_nat_htable_size;
+
+static struct list_head *bysource;
+struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+
+
+/* We keep an extra hash for each conntrack, for fast searching.
+ * The bucket index is a jhash over the tuple's source IP, source
+ * per-protocol part and protocol number, reduced modulo
+ * ip_nat_htable_size. */
+static inline unsigned int
+hash_by_src(const struct ip_conntrack_tuple *tuple)
+{
+	/* Original src, to ensure we map it consistently if poss. */
+	return jhash_3words(tuple->src.ip, tuple->src.u.all,
+			    tuple->dst.protonum, 0) % ip_nat_htable_size;
+}
+
+/* No one is using the conntrack by the time this is called.
+ * Unlinks it from its by-source list under the NAT write lock; a
+ * conntrack with no IPS_NAT_DONE_MASK bit set is left untouched. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+	if (!(conn->status & IPS_NAT_DONE_MASK))
+		return;
+
+	WRITE_LOCK(&ip_nat_lock);
+	list_del(&conn->nat.info.bysource);
+	WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong.  Also works for incomplete packets (eg. 
ICMP dest
+ * unreachables.)
+ *
+ * @oldvalinv is the bitwise complement of the 32-bit value being replaced,
+ * @newval the replacement and @oldcheck the current checksum; the return
+ * value is the adjusted checksum. */
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+	u_int32_t diffs[] = { oldvalinv, newval };
+	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+				      oldcheck^0xFFFF));
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+		  const struct ip_conntrack *ignored_conntrack)
+{
+	/* Conntrack tracking doesn't keep track of outgoing tuples; only
+	   incoming ones.  NAT means they don't have a fixed mapping,
+	   so we invert the tuple and look for the incoming reply.
+
+	   We could keep a separate hash if this proves too slow. */
+	struct ip_conntrack_tuple reply;
+
+	invert_tuplepr(&reply, tuple);
+	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range.  Returns 1 if so, 0 otherwise. */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+	 const struct ip_nat_range *range)
+{
+	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+
+	/* If we are supposed to map IPs, then we must be in the
+	   range specified, otherwise let this drag us onto a new src IP. */
+	if (range->flags & IP_NAT_RANGE_MAP_IPS) {
+		if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
+		    || ntohl(tuple->src.ip) > ntohl(range->max_ip))
+			return 0;
+	}
+
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+	    || proto->in_range(tuple, IP_NAT_MANIP_SRC,
+			       &range->min, &range->max))
+		return 1;
+
+	return 0;
+}
+
+/* True when @ct's original-direction tuple has the same protocol, source
+ * IP and source per-protocol part as @tuple. */
+static inline int
+same_src(const struct ip_conntrack *ct,
+	 const struct ip_conntrack_tuple *tuple)
+{
+	return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+		== tuple->dst.protonum
+		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+		== tuple->src.ip
+		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+		== tuple->src.u.all);
+}
+
+/* Only called for SRC manip.
+ * Walks the by-source bucket of @tuple looking for a conntrack with the
+ * same source whose existing mapping, combined with @tuple's destination,
+ * lies within @range.  On success stores that mapping in @result and
+ * returns 1; returns 0 when none is found. */
+static int
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+		     struct ip_conntrack_tuple *result,
+		     const struct ip_nat_range *range)
+{
+	unsigned int h = hash_by_src(tuple);
+	struct ip_conntrack *ct;
+
+	READ_LOCK(&ip_nat_lock);
+	list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
+		if (same_src(ct, tuple)) {
+			/* Copy source part from reply tuple. */
+			invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
+
+			if (in_range(result, range)) {
+				READ_UNLOCK(&ip_nat_lock);
+				return 1;
+			}
+		}
+	}
+	READ_UNLOCK(&ip_nat_lock);
+	return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
+   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+   1-65535, we don't do pro-rata allocation based on ports; we choose
+   the ip with the lowest src-ip/dst-ip/proto usage.
+
+   What the code does today: when the range maps IPs, it overwrites the
+   source (SRC manip) or destination (otherwise) IP of @tuple with an
+   address taken from [min_ip, max_ip], chosen by hashing the tuple's
+   source and destination IPs.  @conntrack is not used.
+*/
+static void
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+		    const struct ip_nat_range *range,
+		    const struct ip_conntrack *conntrack,
+		    enum ip_nat_manip_type maniptype)
+{
+	u_int32_t *var_ipp;
+	/* Host order */
+	u_int32_t minip, maxip, span, j;
+
+	/* No IP mapping?  Do nothing. */
+	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
+		return;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		var_ipp = &tuple->src.ip;
+	else
+		var_ipp = &tuple->dst.ip;
+
+	/* Fast path: only one choice. */
+	if (range->min_ip == range->max_ip) {
+		*var_ipp = range->min_ip;
+		return;
+	}
+
+	/* Hashing source and destination IPs gives a fairly even
+	 * spread in practice (if there are a small number of IPs
+	 * involved, there usually aren't that many connections
+	 * anyway).  The consistency means that servers see the same
+	 * client coming from the same IP (some Internet Banking sites
+	 * like this), even across reboots. */
+	minip = ntohl(range->min_ip);
+	maxip = ntohl(range->max_ip);
+	j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
+
+	/* The number of addresses wraps to 0 in 32 bits when the range is
+	 * 0.0.0.0-255.255.255.255; taking the hash modulo that would be a
+	 * division by zero.  In that case every address is in range, so
+	 * the hash itself is used. */
+	span = maxip - minip + 1;
+	if (span == 0)
+		*var_ipp = htonl(j);
+	else
+		*var_ipp = htonl(minip + j % span);
+}
+
+/* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
+ * we change the source to map into the range.  For NF_IP_PRE_ROUTING
+ * and NF_IP_LOCAL_OUT, we change the destination to map into the
+ * range.  It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_conntrack_tuple *orig_tuple,
+		 const struct ip_nat_range *range,
+		 struct ip_conntrack *conntrack,
+		 enum ip_nat_manip_type maniptype)
+{
+	struct ip_nat_protocol *proto
+		= ip_nat_find_proto(orig_tuple->dst.protonum);
+
+	/* 1) If this srcip/proto/src-proto-part is currently mapped,
+	   and that same mapping gives a unique tuple within the given
+	   range, use that.
+
+	   This is only required for source (ie. NAT/masq) mappings.
+	   So far, we don't do local source mappings, so multiple
+	   manips not an issue.  */
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		if (find_appropriate_src(orig_tuple, tuple, range)) {
+			DEBUGP("get_unique_tuple: Found current src map\n");
+			if (!ip_nat_used_tuple(tuple, conntrack))
+				return;
+		}
+	}
+
+	/* 2) Select the least-used IP/proto combination in the given
+	   range. */
+	*tuple = *orig_tuple;
+	find_best_ips_proto(tuple, range, conntrack, maniptype);
+
+	/* 3) The per-protocol part of the manip is made to map into
+	   the range to make a unique tuple. */
+
+	/* Only bother mapping if it's not already in range and unique */
+	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
+	    && !ip_nat_used_tuple(tuple, conntrack))
+		return;
+
+	/* Last chance: get protocol to try to obtain unique tuple. */
+	proto->unique_tuple(tuple, range, maniptype, conntrack);
+}
+
+/* Choose the NAT mapping of @conntrack for the manip type implied by
+ * @hooknum, constrained by @range.  Alters the conntrack's reply tuple
+ * when the chosen tuple differs from the current one, links the conntrack
+ * into the by-source hash on first use and sets the matching "NAT done"
+ * status bit.  Always returns NF_ACCEPT. */
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack,
+		  const struct ip_nat_range *range,
+		  unsigned int hooknum)
+{
+	struct ip_conntrack_tuple curr_tuple, new_tuple;
+	struct ip_nat_info *info = &conntrack->nat.info;
+	int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
+	enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_POST_ROUTING
+		     || hooknum == NF_IP_LOCAL_IN
+		     || hooknum == NF_IP_LOCAL_OUT);
+	BUG_ON(ip_nat_initialized(conntrack, maniptype));
+
+	/* What we've got will look like inverse of reply. Normally
+	   this is what is in the conntrack, except for prior
+	   manipulations (future optimization: if num_manips == 0,
+	   orig_tp =
+	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+	invert_tuplepr(&curr_tuple,
+		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
+
+	if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+		struct ip_conntrack_tuple reply;
+
+		/* Alter conntrack table so will recognize replies. */
+		invert_tuplepr(&reply, &new_tuple);
+		ip_conntrack_alter_reply(conntrack, &reply);
+
+		/* Non-atomic: we own this at the moment. */
+		if (maniptype == IP_NAT_MANIP_SRC)
+			conntrack->status |= IPS_SRC_NAT;
+		else
+			conntrack->status |= IPS_DST_NAT;
+	}
+
+	/* Place in source hash if this is the first time. */
+	if (have_to_hash) {
+		unsigned int srchash
+			= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+				      .tuple);
+		WRITE_LOCK(&ip_nat_lock);
+		list_add(&info->bysource, &bysource[srchash]);
+		WRITE_UNLOCK(&ip_nat_lock);
+	}
+
+	/* It's done. */
+	if (maniptype == IP_NAT_MANIP_DST)
+		set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
+	else
+		set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
+
+	return NF_ACCEPT;
+}
+
+/* Returns true if succeeded.
+ * Rewrites the IP header found @iphdroff bytes into the packet so that its
+ * source (SRC manip) or destination address matches @target, after letting
+ * the handler for @proto rewrite the per-protocol part.  The IP header
+ * checksum is adjusted incrementally. */
+static int
+manip_pkt(u_int16_t proto,
+	  struct sk_buff **pskb,
+	  unsigned int iphdroff,
+	  const struct ip_conntrack_tuple *target,
+	  enum ip_nat_manip_type maniptype)
+{
+	struct iphdr *iph;
+
+	(*pskb)->nfcache |= NFC_ALTERED;
+	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+		return 0;
+
+	iph = (void *)(*pskb)->data + iphdroff;
+
+	/* Manipulate protocol part. */
+	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
+						 target, maniptype))
+		return 0;
+
+	/* Recompute the header pointer from *pskb after the per-protocol
+	 * handler has run. */
+	iph = (void *)(*pskb)->data + iphdroff;
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
+						iph->check);
+		iph->saddr = target->src.ip;
+	} else {
+		iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
+						iph->check);
+		iph->daddr = target->dst.ip;
+	}
+	return 1;
+}
+
+/* Do packet manipulations according to ip_nat_setup_info. 
*/
+unsigned int nat_packet(struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int hooknum,
+			struct sk_buff **pskb)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned long statusbit;
+	enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
+	    && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
+		DEBUGP("ip_nat_core: adjusting sequence number\n");
+		/* future: put this in a l4-proto specific function,
+		 * and call this function here. */
+		if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
+			return NF_DROP;
+	}
+
+	if (mtype == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	/* Non-atomic: these bits don't change. */
+	if (ct->status & statusbit) {
+		struct ip_conntrack_tuple target;
+
+		/* We are aiming to look like inverse of other direction. */
+		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+		if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
+			return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+/* Dir is direction ICMP is coming from (opposite to packet it contains).
+ * Translates both the packet embedded in the ICMP error and, where
+ * applicable, the outer IP header.  Returns 1 on success and 0 on any
+ * failure (not writable, bad checksum, rejected redirect, tuple or
+ * manipulation failure). */
+int icmp_reply_translation(struct sk_buff **pskb,
+			   struct ip_conntrack *ct,
+			   enum ip_nat_manip_type manip,
+			   enum ip_conntrack_dir dir)
+{
+	struct {
+		struct icmphdr icmp;
+		struct iphdr ip;
+	} *inside;
+	struct ip_conntrack_tuple inner, target;
+	int hdrlen = (*pskb)->nh.iph->ihl * 4;
+
+	if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+		return 0;
+
+	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+
+	/* We're actually going to mangle it beyond trivial checksum
+	   adjustment, so make sure the current checksum is correct. */
+	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
+		hdrlen = (*pskb)->nh.iph->ihl * 4;
+		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
+						(*pskb)->len - hdrlen, 0)))
+			return 0;
+	}
+
+	/* Must be RELATED */
+	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
+		     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
+
+	/* Redirects on non-null nats must be dropped, else they'll
+           start talking to each other without our translation, and be
+           confused... --RR */
+	if (inside->icmp.type == ICMP_REDIRECT) {
+		/* If NAT isn't finished, assume it and drop. */
+		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+			return 0;
+
+		if (ct->status & IPS_NAT_MASK)
+			return 0;
+	}
+
+	DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
+	       *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+
+	if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
+	                     sizeof(struct icmphdr) + inside->ip.ihl*4,
+	                     &inner, ip_ct_find_proto(inside->ip.protocol)))
+		return 0;
+
+	/* Change inner back to look like incoming packet.  We do the
+	   opposite manip on this hook to normal, because it might not
+	   pass all hooks (locally-generated ICMP).  Consider incoming
+	   packet: PREROUTING (DST manip), routing produces ICMP, goes
+	   through POSTROUTING (which must correct the DST manip). */
+	if (!manip_pkt(inside->ip.protocol, pskb,
+		       (*pskb)->nh.iph->ihl*4
+		       + sizeof(inside->icmp),
+		       &ct->tuplehash[!dir].tuple,
+		       !manip))
+		return 0;
+
+	/* Reloading "inside" here since manip_pkt inner. */
+	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+	inside->icmp.checksum = 0;
+	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
+						       (*pskb)->len - hdrlen,
+						       0));
+
+	/* Change outer to look the reply to an incoming packet
+	 * (proto 0 means don't invert per-proto part). */
+
+	/* Obviously, we need to NAT destination IP, but source IP
+	   should be NAT'ed only if it is from a NAT'd host.
+
+	   Explanation: some people use NAT for anonymizing.  Also,
+	   CERT recommends dropping all packets from private IP
+	   addresses (although ICMP errors from internal links with
+	   such addresses are not too uncommon, as Alan Cox points
+	   out) */
+	if (manip != IP_NAT_MANIP_SRC
+	    || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
+		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+		if (!manip_pkt(0, pskb, 0, &target, manip))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Protocol registration.
+ * Returns 0, or -EBUSY when the slot for proto->protonum does not hold
+ * the "unknown" protocol. */
+int ip_nat_protocol_register(struct ip_nat_protocol *proto)
+{
+	int ret = 0;
+
+	WRITE_LOCK(&ip_nat_lock);
+	if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
+		ret = -EBUSY;
+		goto out;
+	}
+	ip_nat_protos[proto->protonum] = proto;
+ out:
+	WRITE_UNLOCK(&ip_nat_lock);
+	return ret;
+}
+
+/* No one stores the protocol anywhere; simply delete it. */
+void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
+{
+	WRITE_LOCK(&ip_nat_lock);
+	ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	/* Someone could be still looking at the proto in a bh. */
+	synchronize_net();
+}
+
+int __init ip_nat_init(void)
+{
+	size_t i;
+
+	/* Leave them the same for the moment. */
+	ip_nat_htable_size = ip_conntrack_htable_size;
+
+	/* One vmalloc for the by-source hash table (the only table
+	 * allocated here). */
+	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
+	if (!bysource)
+		return -ENOMEM;
+
+	/* Sew in builtin protocols. */
+	WRITE_LOCK(&ip_nat_lock);
+	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+		ip_nat_protos[i] = &ip_nat_unknown_protocol;
+	ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
+	ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
+	ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	for (i = 0; i < ip_nat_htable_size; i++) {
+		INIT_LIST_HEAD(&bysource[i]);
+	}
+
+	/* FIXME: Man, this is a hack.  <SIGH> */
+	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+	/* Initialize fake conntrack so that NAT will skip it */
+	ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+	return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again.
+ * Always returns 0. */
+static int clean_nat(struct ip_conntrack *i, void *data)
+{
+	memset(&i->nat, 0, sizeof(i->nat));
+	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+	return 0;
+}
+
+/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
+void ip_nat_cleanup(void)
+{
+	ip_ct_iterate_cleanup(&clean_nat, NULL);
+	ip_conntrack_destroyed = NULL;
+	vfree(bysource);
+}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 000000000000..c6000e794ad6
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,183 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/moduleparam.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Time out? 
--RR */
+
+/*
+ * Replace the matched text with "a,b,c,d,p1,p2": the four octets of @newip
+ * followed by the high and low byte of @port.  *seq is advanced by the
+ * change in length.  Returns what ip_nat_mangle_tcp_packet() returns.
+ */
+static int
+mangle_rfc959_packet(struct sk_buff **pskb,
+		     u_int32_t newip,
+		     u_int16_t port,
+		     unsigned int matchoff,
+		     unsigned int matchlen,
+		     struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo,
+		     u32 *seq)
+{
+	char text[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
+	unsigned int textlen;
+
+	sprintf(text, "%u,%u,%u,%u,%u,%u",
+		NIPQUAD(newip), port>>8, port&0xFF);
+	textlen = strlen(text);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += textlen - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+					matchlen, text, textlen);
+}
+
+/* EPRT form: |1|132.235.1.2|6275| */
+static int
+mangle_eprt_packet(struct sk_buff **pskb,
+		   u_int32_t newip,
+		   u_int16_t port,
+		   unsigned int matchoff,
+		   unsigned int matchlen,
+		   struct ip_conntrack *ct,
+		   enum ip_conntrack_info ctinfo,
+		   u32 *seq)
+{
+	char text[sizeof("|1|255.255.255.255|65535|")];
+	unsigned int textlen;
+
+	sprintf(text, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
+	textlen = strlen(text);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += textlen - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+					matchlen, text, textlen);
+}
+
+/* EPSV form: |||6275| -- only the port is written, @newip is unused. */
+static int
+mangle_epsv_packet(struct sk_buff **pskb,
+		   u_int32_t newip,
+		   u_int16_t port,
+		   unsigned int matchoff,
+		   unsigned int matchlen,
+		   struct ip_conntrack *ct,
+		   enum ip_conntrack_info ctinfo,
+		   u32 *seq)
+{
+	char text[sizeof("|||65535|")];
+	unsigned int textlen;
+
+	sprintf(text, "|||%u|", port);
+	textlen = strlen(text);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += textlen - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff,
+					matchlen, text, textlen);
+}
+
+/* Rewriter for each FTP command type, indexed by enum ip_ct_ftp_type. */
+static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t,
+		     unsigned int,
+		     unsigned int,
+		     struct ip_conntrack *,
+		     enum ip_conntrack_info,
+		     u32 *seq)
+= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
+    [IP_CT_FTP_PASV] = mangle_rfc959_packet,
+    [IP_CT_FTP_EPRT] = mangle_eprt_packet,
+    [IP_CT_FTP_EPSV] = mangle_epsv_packet
+};
+
+/* So, this packet has hit the connection 
tracking matching code.
+   Mangle it, and change the expectation to match the new version.
+
+   Registers exp on the first destination port, counting up from its
+   current one, that ip_conntrack_expect_related() accepts, then rewrites
+   the address text in the packet through mangle[type].  Returns NF_DROP
+   when no port could be registered (exp is freed) or when the rewrite
+   fails (the expectation is withdrawn); NF_ACCEPT otherwise. */
+static unsigned int ip_nat_ftp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       enum ip_ct_ftp_type type,
+			       unsigned int matchoff,
+			       unsigned int matchlen,
+			       struct ip_conntrack_expect *exp,
+			       u32 *seq)
+{
+	struct ip_conntrack *master = exp->master;
+	int dir = CTINFO2DIR(ctinfo);
+	u_int32_t addr;
+	u_int16_t candidate;
+
+	DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+	/* Connection will come from wherever this packet goes, hence !dir */
+	addr = master->tuplehash[!dir].tuple.dst.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = !dir;
+
+	/* The expected connection must be NATed the same way as this
+	 * one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Prefer the original port; otherwise walk upwards until one is
+	 * accepted or the 16-bit counter wraps round to 0. */
+	candidate = ntohs(exp->saved_proto.tcp.port);
+	while (candidate != 0) {
+		exp->tuple.dst.u.tcp.port = htons(candidate);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+		candidate++;
+	}
+
+	if (candidate == 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+
+	if (mangle[type](pskb, addr, candidate, matchoff, matchlen, master,
+			 ctinfo, seq))
+		return NF_ACCEPT;
+
+	ip_conntrack_unexpect_related(exp);
+	return NF_DROP;
+}
+
+/* Module unload: clear the hook, then wait for users already inside it. */
+static void __exit fini(void)
+{
+	ip_nat_ftp_hook = NULL;
+	/* Make sure no one calls it, meanwhile. */
+	synchronize_net();
+}
+
+/* Module load: install ip_nat_ftp() as the FTP NAT hook (must be unset). */
+static int __init init(void)
+{
+	BUG_ON(ip_nat_ftp_hook);
+	ip_nat_ftp_hook = ip_nat_ftp;
+	return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. 
*/
+static int warn_set(const char *val, struct kernel_param *kp)
+{	/* The supplied value is ignored: log a notice and report success
+	 * (0). */
+	printk(KERN_INFO __stringify(KBUILD_MODNAME)
+	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
new file mode 100644
index 000000000000..1637b96d8c01
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -0,0 +1,430 @@
+/* ip_nat_helper.c - generic support functions for NAT helpers 
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 	14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
+ *		- add support for SACK adjustment 
+ *	14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ *		- merge SACK support into newnat API
+ *	16 Aug 2002 Brian J. 
Murrell <netfilter@interlinx.bc.ca>:
+ *		- make ip_nat_resize_packet more generic (TCP and UDP)
+ *		- add ip_nat_mangle_udp_packet
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#define DUMP_OFFSET(x)	printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
+#else
+#define DEBUGP(format, args...)
+#define DUMP_OFFSET(x)
+#endif
+
+static DECLARE_LOCK(ip_nat_seqofs_lock);
+
+/* Setup TCP sequence correction given this change at this sequence.
+ * @seq is the sequence number of the mangled segment, @sizediff the change
+ * in payload length; the per-direction record in @ct is updated under
+ * ip_nat_seqofs_lock. */
+static inline void 
+adjust_tcp_sequence(u32 seq,
+		    int sizediff,
+		    struct ip_conntrack *ct, 
+		    enum ip_conntrack_info ctinfo)
+{
+	int dir;
+	struct ip_nat_seq *this_way, *other_way;
+
+	/* Only names that exist in this function may be printed here,
+	 * otherwise the file stops compiling once DEBUGP is enabled. */
+	DEBUGP("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+		seq, sizediff);
+
+	dir = CTINFO2DIR(ctinfo);
+
+	this_way = &ct->nat.info.seq[dir];
+	other_way = &ct->nat.info.seq[!dir];
+
+	DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
+	DUMP_OFFSET(this_way);
+
+	LOCK_BH(&ip_nat_seqofs_lock);
+
+	/* SYN adjust. If it's uninitialized, or this is after last
+	 * correction, record it: we don't handle more than one
+	 * adjustment in the window, but do deal with common case of a
+	 * retransmit */
+	if (this_way->offset_before == this_way->offset_after
+	    || before(this_way->correction_pos, seq)) {
+		    this_way->correction_pos = seq;
+		    this_way->offset_before = this_way->offset_after;
+		    this_way->offset_after += sizediff;
+	}
+	UNLOCK_BH(&ip_nat_seqofs_lock);
+
+	DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
+	DUMP_OFFSET(this_way);
+}
+
+/* Frobs data inside this packet, which is linear.
+ * Replaces @match_len bytes at @match_offset (relative to @dataoff bytes
+ * past the IP header) with @rep_len bytes from @rep_buffer, moving the
+ * rest of the packet accordingly, then fixes the skb length, the IP total
+ * length and the IP header checksum.  The caller must already have made
+ * room for any growth. */
+static void mangle_contents(struct sk_buff *skb,
+			    unsigned int dataoff,
+			    unsigned int match_offset,
+			    unsigned int match_len,
+			    const char *rep_buffer,
+			    unsigned int rep_len)
+{
+	unsigned char *data;
+
+	BUG_ON(skb_is_nonlinear(skb));
+	data = (unsigned char *)skb->nh.iph + dataoff;
+
+	/* move post-replacement */
+	memmove(data + match_offset + rep_len,
+		data + match_offset + match_len,
+		skb->tail - (data + match_offset + match_len));
+
+	/* insert data from buffer */
+	memcpy(data + match_offset, rep_buffer, rep_len);
+
+	/* update skb info */
+	if (rep_len > match_len) {
+		DEBUGP("ip_nat_mangle_packet: Extending packet by "
+			"%u from %u bytes\n", rep_len - match_len,
+		       skb->len);
+		skb_put(skb, rep_len - match_len);
+	} else {
+		DEBUGP("ip_nat_mangle_packet: Shrinking packet by "
+			"%u from %u bytes\n", match_len - rep_len,
+		       skb->len);
+		__skb_trim(skb, skb->len + rep_len - match_len);
+	}
+
+	/* fix IP hdr checksum information */
+	skb->nh.iph->tot_len = htons(skb->len);
+	ip_send_check(skb->nh.iph);
+}
+
+/* Unusual, but possible case.
+ * Replaces *pskb with a copy that has @extra more bytes of tailroom.
+ * Returns 1 on success; returns 0, leaving *pskb alone, when the packet
+ * would exceed 65535 bytes or the copy cannot be allocated. */
+static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
+{
+	struct sk_buff *nskb;
+
+	if ((*pskb)->len + extra > 65535)
+		return 0;
+
+	nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
+	if (!nskb)
+		return 0;
+
+	/* Transfer socket to new skb. */
+	if ((*pskb)->sk)
+		skb_set_owner_w(nskb, (*pskb)->sk);
+#ifdef CONFIG_NETFILTER_DEBUG
+	nskb->nf_debug = (*pskb)->nf_debug;
+#endif
+	kfree_skb(*pskb);
+	*pskb = nskb;
+	return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * Returns 1 on success, 0 when the packet could not be made writable or
+ * enlarged.
+ * */
+int 
+ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
+			 struct ip_conntrack *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int match_offset,
+			 unsigned int match_len,
+			 const char *rep_buffer,
+			 unsigned int rep_len)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	int datalen;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return 0;
+
+	if (rep_len > match_len
+	    && rep_len - match_len > skb_tailroom(*pskb)
+	    && !enlarge_skb(pskb, rep_len - match_len))
+		return 0;
+
+	SKB_LINEAR_ASSERT(*pskb);
+
+	iph = (*pskb)->nh.iph;
+	tcph = (void *)iph + iph->ihl*4;
+
+	mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
+			match_offset, match_len, rep_buffer, rep_len);
+
+	/* The TCP checksum is recomputed from scratch over header and
+	 * payload. */
+	datalen = (*pskb)->len - iph->ihl*4;
+	tcph->check = 0;
+	tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
+				   csum_partial((char *)tcph, datalen, 0));
+
+	if (rep_len != match_len) {
+		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+		adjust_tcp_sequence(ntohl(tcph->seq),
+				    (int)rep_len - (int)match_len,
+				    ct, ctinfo);
+		/* Tell TCP window tracking about seq change */
+		ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
+	}
+	return 1;
+}
+			
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ... 
+ * + * XXX - This function could be merged with ip_nat_mangle_tcp_packet which + * should be fairly easy to do. + */ +int +ip_nat_mangle_udp_packet(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct udphdr *udph; + + /* UDP helpers might accidentally mangle the wrong packet */ + iph = (*pskb)->nh.iph; + if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + + match_offset + match_len) + return 0; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return 0; + + if (rep_len > match_len + && rep_len - match_len > skb_tailroom(*pskb) + && !enlarge_skb(pskb, rep_len - match_len)) + return 0; + + iph = (*pskb)->nh.iph; + udph = (void *)iph + iph->ihl*4; + mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + udph->len = htons((*pskb)->len - iph->ihl*4); + + /* fix udp checksum if udp checksum was previously calculated */ + if (udph->check) { + int datalen = (*pskb)->len - iph->ihl * 4; + udph->check = 0; + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + csum_partial((char *)udph, + datalen, 0)); + } + + return 1; +} + +/* Adjust one found SACK option including checksum correction */ +static void +sack_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct ip_nat_seq *natseq) +{ + while (sackoff < sackend) { + struct tcp_sack_block *sack; + u_int32_t new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - natseq->offset_before, + natseq->correction_pos)) + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_after; + else + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_before; + new_start_seq = htonl(new_start_seq); + + if (after(ntohl(sack->end_seq) - 
natseq->offset_before, + natseq->correction_pos)) + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_after; + else + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_before; + new_end_seq = htonl(new_end_seq); + + DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + tcph->check = + ip_nat_cheat_check(~sack->start_seq, new_start_seq, + ip_nat_cheat_check(~sack->end_seq, + new_end_seq, + tcph->check)); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static inline unsigned int +ip_nat_sack_adjust(struct sk_buff **pskb, + struct tcphdr *tcph, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + + optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); + optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; + + if (!skb_ip_make_writable(pskb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = (*pskb)->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend + || optoff + op[1] > optend + || op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK + && op[1] >= 2+TCPOLEN_SACK_PERBLOCK + && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + sack_adjust(*pskb, tcph, optoff+2, + optoff+op[1], + &ct->nat.info.seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. 
Returns 1 on success, 0 on failure */ +int +ip_nat_seq_adjust(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + struct tcphdr *tcph; + int dir, newseq, newack; + struct ip_nat_seq *this_way, *other_way; + + dir = CTINFO2DIR(ctinfo); + + this_way = &ct->nat.info.seq[dir]; + other_way = &ct->nat.info.seq[!dir]; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + + tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + if (after(ntohl(tcph->seq), this_way->correction_pos)) + newseq = ntohl(tcph->seq) + this_way->offset_after; + else + newseq = ntohl(tcph->seq) + this_way->offset_before; + newseq = htonl(newseq); + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + newack = ntohl(tcph->ack_seq) - other_way->offset_after; + else + newack = ntohl(tcph->ack_seq) - other_way->offset_before; + newack = htonl(newack); + + tcph->check = ip_nat_cheat_check(~tcph->seq, newseq, + ip_nat_cheat_check(~tcph->ack_seq, + newack, + tcph->check)); + + DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo)) + return 0; + + ip_conntrack_tcp_update(*pskb, ct, dir); + + return 1; +} + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void ip_nat_follow_master(struct ip_conntrack *ct, + struct ip_conntrack_expect *exp) +{ + struct ip_nat_range range; + + /* This must be a fresh one. 
*/ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); +} diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c new file mode 100644 index 000000000000..9c1ca3381d56 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -0,0 +1,125 @@ +/* IRC extension for TCP NAT alteration. + * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org> + * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/kernel.h> +#include <net/tcp.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_conntrack_irc.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/moduleparam.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>"); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp) +{ + u_int16_t port; + unsigned int ret; + + /* "4294967296 65635 " */ + char buffer[18]; + + DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n", + expect->seq, exp_irc_info->len, + ntohl(tcph->seq)); + + /* Reply comes from server. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967296, 10 digits) + * P: bound port (min 1 d, max 5d (65635)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + + /* AAA = "us", ie. where server normally talks to. 
*/ + sprintf(buffer, "%u %u", + ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip), + port); + DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n", + buffer, NIPQUAD(exp->tuple.src.ip), port); + + ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, + matchoff, matchlen, buffer, + strlen(buffer)); + if (ret != NF_ACCEPT) + ip_conntrack_unexpect_related(exp); + return ret; +} + +static void __exit fini(void) +{ + ip_nat_irc_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_irc_hook); + ip_nat_irc_hook = help; + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO __stringify(KBUILD_MODNAME) + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c new file mode 100644 index 000000000000..a558cf0eee8a --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -0,0 +1,115 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/icmp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +icmp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return (tuple->src.u.icmp.id >= min->icmp.id + && tuple->src.u.icmp.id <= max->icmp.id); +} + +static int +icmp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t id; + unsigned int range_size + = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1; + unsigned int i; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xFFFF; + + for (i = 0; i < range_size; i++, id++) { + tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +icmp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct icmphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + hdr = (struct icmphdr *)((*pskb)->data + hdroff); + + hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF, + tuple->src.u.icmp.id, + hdr->checksum); + hdr->un.echo.id = tuple->src.u.icmp.id; + return 1; +} + +static unsigned int +icmp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if 
(mask->src.u.icmp.id) + len += sprintf(buffer + len, "id=%u ", + ntohs(match->src.u.icmp.id)); + + if (mask->dst.u.icmp.type) + len += sprintf(buffer + len, "type=%u ", + ntohs(match->dst.u.icmp.type)); + + if (mask->dst.u.icmp.code) + len += sprintf(buffer + len, "code=%u ", + ntohs(match->dst.u.icmp.code)); + + return len; +} + +static unsigned int +icmp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF) + return sprintf(buffer, "id %u-%u ", + ntohs(range->min.icmp.id), + ntohs(range->max.icmp.id)); + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_icmp += { "ICMP", IPPROTO_ICMP, + icmp_manip_pkt, + icmp_in_range, + icmp_unique_tuple, + icmp_print, + icmp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c new file mode 100644 index 000000000000..a91cfceff272 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -0,0 +1,178 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/if.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> + +static int +tcp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.tcp.port; + else + port = tuple->dst.u.tcp.port; + + return ntohs(port) >= ntohs(min->tcp.port) + && ntohs(port) <= ntohs(max->tcp.port); +} + +static int +tcp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.tcp.port; + else + portptr = &tuple->dst.u.tcp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + /* Map privileged onto privileged. 
*/ + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.tcp.port); + range_size = ntohs(range->max.tcp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) { + return 1; + } + } + return 0; +} + +static int +tcp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct tcphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct tcphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return 1; + + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); + return 1; +} 
+ +static unsigned int +tcp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.tcp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.tcp.port)); + + + if (mask->dst.u.tcp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.tcp.port)); + + return len; +} + +static unsigned int +tcp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { + if (range->min.tcp.port == range->max.tcp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.tcp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.tcp.port), + ntohs(range->max.tcp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_tcp += { "TCP", IPPROTO_TCP, + tcp_manip_pkt, + tcp_in_range, + tcp_unique_tuple, + tcp_print, + tcp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c new file mode 100644 index 000000000000..c669e3b5f5d0 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -0,0 +1,165 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int +udp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.udp.port; + else + port = tuple->dst.u.udp.port; + + return ntohs(port) >= ntohs(min->udp.port) + && ntohs(port) <= ntohs(max->udp.port); +} + +static int +udp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.udp.port; + else + portptr = &tuple->dst.u.udp.port; + + /* If no range specified... 
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.udp.port); + range_size = ntohs(range->max.udp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +udp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct udphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check) /* 0 is a special case meaning no checksum */ + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); + *portptr = newport; + return 1; +} + +static unsigned int +udp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.udp.port) + len += sprintf(buffer + 
len, "srcpt=%u ", + ntohs(match->src.u.udp.port)); + + + if (mask->dst.u.udp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.udp.port)); + + return len; +} + +static unsigned int +udp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) { + if (range->min.udp.port == range->max.udp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.udp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.udp.port), + ntohs(range->max.udp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_udp += { "UDP", IPPROTO_UDP, + udp_manip_pkt, + udp_in_range, + udp_unique_tuple, + udp_print, + udp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c new file mode 100644 index 000000000000..f5525bd58d16 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -0,0 +1,70 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/netfilter.h> +#include <linux/if.h> + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> + +static int unknown_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type manip_type, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return 1; +} + +static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + anything. */ + return 0; +} + +static int +unknown_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + return 1; +} + +static unsigned int +unknown_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + return 0; +} + +static unsigned int +unknown_print_range(char *buffer, const struct ip_nat_range *range) +{ + return 0; +} + +struct ip_nat_protocol ip_nat_unknown_protocol = { + "unknown", 0, + unknown_manip_pkt, + unknown_in_range, + unknown_unique_tuple, + unknown_print, + unknown_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c new file mode 100644 index 000000000000..581f097f5a24 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -0,0 +1,319 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Everything about the rules for NAT. 
*/ +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/checksum.h> +#include <net/route.h> +#include <linux/bitops.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} nat_initial_table __initdata += { { "nat", NAT_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 
}, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table nat_table = { + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* Source NAT */ +static unsigned int ipt_snat_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr = targinfo; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. */ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + IP_NF_ASSERT(out); + + return ip_nat_setup_info(ct, &mr->range[0], hooknum); +} + +/* Before 2.6.11 we did implicit source NAT if required. Warn about change. 
*/ +static void warn_if_extra_mangle(u32 dstip, u32 srcip) +{ + static int warned = 0; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; + struct rtable *rt; + + if (ip_route_output_key(&rt, &fl) != 0) + return; + + if (rt->rt_src != srcip && !warned) { + printk("NAT: no longer support implicit source local NAT\n"); + printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n", + NIPQUAD(srcip), NIPQUAD(dstip)); + warned = 1; + } + ip_rt_put(rt); +} + +static unsigned int ipt_dnat_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr = targinfo; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. */ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + if (hooknum == NF_IP_LOCAL_OUT + && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) + warn_if_extra_mangle((*pskb)->nh.iph->daddr, + mr->range[0].min_ip); + + return ip_nat_setup_info(ct, &mr->range[0], hooknum); +} + +static int ipt_snat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("SNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("SNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. 
*/ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("SNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + return 1; +} + +static int ipt_dnat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("DNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("DNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("DNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + + return 1; +} + +inline unsigned int +alloc_null_binding(struct ip_conntrack *conntrack, + struct ip_nat_info *info, + unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + Use reply in case it's already been mangled (eg local packet). + */ + u_int32_t ip + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? 
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + struct ip_nat_range range + = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; + + DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, + NIPQUAD(ip)); + return ip_nat_setup_info(conntrack, &range, hooknum); +} + +int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct ip_conntrack *ct, + struct ip_nat_info *info) +{ + int ret; + + ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); + + if (ret == NF_ACCEPT) { + if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) + /* NUL mapping */ + ret = alloc_null_binding(ct, info, hooknum); + } + return ret; +} + +static struct ipt_target ipt_snat_reg = { + .name = "SNAT", + .target = ipt_snat_target, + .checkentry = ipt_snat_checkentry, +}; + +static struct ipt_target ipt_dnat_reg = { + .name = "DNAT", + .target = ipt_dnat_target, + .checkentry = ipt_dnat_checkentry, +}; + +int __init ip_nat_rule_init(void) +{ + int ret; + + ret = ipt_register_table(&nat_table, &nat_initial_table.repl); + if (ret != 0) + return ret; + ret = ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + + ret = ipt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: + ipt_unregister_target(&ipt_snat_reg); + unregister_table: + ipt_unregister_table(&nat_table); + + return ret; +} + +void ip_nat_rule_cleanup(void) +{ + ipt_unregister_target(&ipt_dnat_reg); + ipt_unregister_target(&ipt_snat_reg); + ipt_unregister_table(&nat_table); +} diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c new file mode 100644 index 000000000000..2a48b6e635ae --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -0,0 +1,1347 @@ +/* + * ip_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is 
intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASN.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: James Morris <jmorris@intercode.com.au> + * + * Updates: + * 2000-08-06: Convert to new helper API (Harald Welte). 
+ * + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/ip.h> +#include <net/checksum.h> +#include <net/udp.h> +#include <asm/uaccess.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 +#define NOCT1(n) (u_int8_t )((n) & 0xff) + +static int debug; +static DEFINE_SPINLOCK(snmp_lock); + +/* + * Application layer address mapping mimics the NAT mapping, but + * only for the first octet in this case (a more flexible system + * can be implemented if needed). + */ +struct oct1_map +{ + u_int8_t from; + u_int8_t to; +}; + + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ 
+#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. + */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +/* + * ASN.1 context. + */ +struct asn1_ctx +{ + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr +{ + unsigned char *data; + unsigned int len; +}; + +static void asn1_open(struct asn1_ctx *ctx, + unsigned char *buf, + unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do + { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char asn1_length_decode(struct asn1_ctx *ctx, + 
unsigned int *def, + unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + return 1; +} + +static unsigned char asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned int def, len; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + if (def) + *eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char 
ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, + unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} + +static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, + unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long **oid, + unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) { + if (net_ratelimit()) 
+ printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr [0] = 0; + optr [1] = subid; + } else if (subid < 80) { + optr [0] = 1; + optr [1] = subid - 40; + } else { + optr [0] = 2; + optr [1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +/***************************************************************************** + * + * SNMP decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* SNMP Versions */ +#define SNMP_V1 0 +#define SNMP_V2C 1 +#define SNMP_V2 2 +#define SNMP_V3 3 + +/* Default Sizes */ +#define SNMP_SIZE_COMM 256 +#define SNMP_SIZE_OBJECTID 128 +#define SNMP_SIZE_BUFCHR 256 +#define SNMP_SIZE_BUFINT 128 +#define SNMP_SIZE_SMALLOBJECTID 16 + +/* Requests */ +#define SNMP_PDU_GET 0 +#define SNMP_PDU_NEXT 1 +#define SNMP_PDU_RESPONSE 2 +#define SNMP_PDU_SET 3 +#define SNMP_PDU_TRAP1 4 +#define SNMP_PDU_BULK 5 +#define SNMP_PDU_INFORM 6 +#define SNMP_PDU_TRAP2 7 + +/* Errors */ +#define SNMP_NOERROR 0 +#define SNMP_TOOBIG 1 +#define SNMP_NOSUCHNAME 2 +#define SNMP_BADVALUE 3 +#define SNMP_READONLY 4 +#define SNMP_GENERROR 5 +#define SNMP_NOACCESS 6 +#define SNMP_WRONGTYPE 7 +#define SNMP_WRONGLENGTH 8 +#define SNMP_WRONGENCODING 9 +#define SNMP_WRONGVALUE 10 +#define SNMP_NOCREATION 11 +#define SNMP_INCONSISTENTVALUE 12 +#define SNMP_RESOURCEUNAVAILABLE 13 +#define SNMP_COMMITFAILED 14 +#define SNMP_UNDOFAILED 15 +#define SNMP_AUTHORIZATIONERROR 16 +#define SNMP_NOTWRITABLE 17 +#define SNMP_INCONSISTENTNAME 18 + +/* General SNMP V1 Traps */ +#define SNMP_TRAP_COLDSTART 0 +#define 
SNMP_TRAP_WARMSTART 1 +#define SNMP_TRAP_LINKDOWN 2 +#define SNMP_TRAP_LINKUP 3 +#define SNMP_TRAP_AUTFAILURE 4 +#define SNMP_TRAP_EQPNEIGHBORLOSS 5 +#define SNMP_TRAP_ENTSPECIFIC 6 + +/* SNMPv1 Types */ +#define SNMP_NULL 0 +#define SNMP_INTEGER 1 /* l */ +#define SNMP_OCTETSTR 2 /* c */ +#define SNMP_DISPLAYSTR 2 /* c */ +#define SNMP_OBJECTID 3 /* ul */ +#define SNMP_IPADDR 4 /* uc */ +#define SNMP_COUNTER 5 /* ul */ +#define SNMP_GAUGE 6 /* ul */ +#define SNMP_TIMETICKS 7 /* ul */ +#define SNMP_OPAQUE 8 /* c */ + +/* Additional SNMPv2 Types */ +#define SNMP_UINTEGER 5 /* ul */ +#define SNMP_BITSTR 9 /* uc */ +#define SNMP_NSAP 10 /* uc */ +#define SNMP_COUNTER64 11 /* ul */ +#define SNMP_NOSUCHOBJECT 12 +#define SNMP_NOSUCHINSTANCE 13 +#define SNMP_ENDOFMIBVIEW 14 + +union snmp_syntax +{ + unsigned char uc[0]; /* 8 bit unsigned */ + char c[0]; /* 8 bit signed */ + unsigned long ul[0]; /* 32 bit unsigned */ + long l[0]; /* 32 bit signed */ +}; + +struct snmp_object +{ + unsigned long *id; + unsigned int id_len; + unsigned short type; + unsigned int syntax_len; + union snmp_syntax syntax; +}; + +struct snmp_request +{ + unsigned long id; + unsigned int error_status; + unsigned int error_index; +}; + +struct snmp_v1_trap +{ + unsigned long *id; + unsigned int id_len; + unsigned long ip_address; /* pointer */ + unsigned int general; + unsigned int specific; + unsigned long time; +}; + +/* SNMP types */ +#define SNMP_IPA 0 +#define SNMP_CNT 1 +#define SNMP_GGE 2 +#define SNMP_TIT 3 +#define SNMP_OPQ 4 +#define SNMP_C64 6 + +/* SNMP errors */ +#define SERR_NSO 0 +#define SERR_NSI 1 +#define SERR_EOM 2 + +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check); +struct snmp_cnv +{ + unsigned int class; + unsigned int tag; + int syntax; +}; + +static struct snmp_cnv snmp_conv [] = +{ + {ASN1_UNI, ASN1_NUL, SNMP_NULL}, + {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, + {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, + 
{ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, + {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, + {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, + {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ + {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ + {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, + {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, + + /* SNMPv2 data types and errors */ + {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, + {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, + {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, + {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, + {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, + {0, 0, -1} +}; + +static unsigned char snmp_tag_cls2syntax(unsigned int tag, + unsigned int cls, + unsigned short *syntax) +{ + struct snmp_cnv *cnv; + + cnv = snmp_conv; + + while (cnv->syntax != -1) { + if (cnv->tag == tag && cnv->class == cls) { + *syntax = cnv->syntax; + return 1; + } + cnv++; + } + return 0; +} + +static unsigned char snmp_object_decode(struct asn1_ctx *ctx, + struct snmp_object **obj) +{ + unsigned int cls, con, tag, len, idlen; + unsigned short type; + unsigned char *eoc, *end, *p; + unsigned long *lp, *id; + unsigned long ul; + long l; + + *obj = NULL; + id = NULL; + + if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &id, &idlen)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { + kfree(id); + return 0; + } + + if (con != ASN1_PRI) { + kfree(id); + return 0; + } + + if (!snmp_tag_cls2syntax(tag, cls, &type)) { + kfree(id); + return 0; + } + + switch (type) { + case SNMP_INTEGER: + len = sizeof(long); + if (!asn1_long_decode(ctx, end, &l)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + 
printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.l[0] = l; + break; + case SNMP_OCTETSTR: + case SNMP_OPAQUE: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.c, p, len); + kfree(p); + break; + case SNMP_NULL: + case SNMP_NOSUCHOBJECT: + case SNMP_NOSUCHINSTANCE: + case SNMP_ENDOFMIBVIEW: + len = 0; + *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + if (!asn1_null_decode(ctx, end)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + break; + case SNMP_OBJECTID: + if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + kfree(id); + return 0; + } + len *= sizeof(unsigned long); + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.ul, lp, len); + kfree(lp); + break; + case SNMP_IPADDR: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + if (len != 4) { + kfree(p); + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.uc, p, len); + kfree(p); + break; + case SNMP_COUNTER: + case SNMP_GAUGE: + case SNMP_TIMETICKS: + len = sizeof(unsigned long); + if (!asn1_ulong_decode(ctx, end, &ul)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + 
(*obj)->syntax.ul[0] = ul; + break; + default: + kfree(id); + return 0; + } + + (*obj)->syntax_len = len; + (*obj)->type = type; + (*obj)->id = id; + (*obj)->id_len = idlen; + + if (!asn1_eoc_decode(ctx, eoc)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + return 1; +} + +static unsigned char snmp_request_decode(struct asn1_ctx *ctx, + struct snmp_request *request) +{ + unsigned int cls, con, tag; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_ulong_decode(ctx, end, &request->id)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_status)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_index)) + return 0; + + return 1; +} + +/* + * Fast checksum update for possibly oddly-aligned UDP byte, from the + * code example in the draft. + */ +static void fast_csum(unsigned char *csum, + const unsigned char *optr, + const unsigned char *nptr, + int odd) +{ + long x, old, new; + + x = csum[0] * 256 + csum[1]; + + x =~ x & 0xFFFF; + + if (odd) old = optr[0] * 256; + else old = optr[0]; + + x -= old & 0xFFFF; + if (x <= 0) { + x--; + x &= 0xFFFF; + } + + if (odd) new = nptr[0] * 256; + else new = nptr[0]; + + x += new & 0xFFFF; + if (x & 0x10000) { + x++; + x &= 0xFFFF; + } + + x =~ x & 0xFFFF; + csum[0] = x / 256; + csum[1] = x & 0xFF; +} + +/* + * Mangle IP address. 
+ * - begin points to the start of the snmp messgae + * - addr points to the start of the address + */ +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check) +{ + if (map->from == NOCT1(*addr)) { + u_int32_t old; + + if (debug) + memcpy(&old, (unsigned char *)addr, sizeof(old)); + + *addr = map->to; + + /* Update UDP checksum if being used */ + if (*check) { + unsigned char odd = !((addr - begin) % 2); + + fast_csum((unsigned char *)check, + &map->from, &map->to, odd); + + } + + if (debug) + printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " + "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); + } +} + +static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, + struct snmp_v1_trap *trap, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned int cls, con, tag, len; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_id_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) + goto err_id_free; + + if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) + goto err_id_free; + + /* IPv4 only */ + if (len != 4) + goto err_addr_free; + + mangle_address(ctx->begin, ctx->pointer - 4, map, check); + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->general)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, 
&trap->specific)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) + goto err_addr_free; + + if (!asn1_ulong_decode(ctx, end, &trap->time)) + goto err_addr_free; + + return 1; + +err_id_free: + kfree(trap->id); + +err_addr_free: + kfree((unsigned long *)trap->ip_address); + + return 0; +} + +/***************************************************************************** + * + * Misc. routines + * + *****************************************************************************/ + +static void hex_dump(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i && !(i % 16)) + printk("\n"); + printk("%02x ", *(buf + i)); + } + printk("\n"); +} + +/* + * Parse and mangle SNMP message according to mapping. + * (And this is the fucking 'basic' method). + */ +static int snmp_parse_mangle(unsigned char *msg, + u_int16_t len, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned char *eoc, *end; + unsigned int cls, con, tag, vers, pdutype; + struct asn1_ctx ctx; + struct asn1_octstr comm; + struct snmp_object **obj; + + if (debug > 1) + hex_dump(msg, len); + + asn1_open(&ctx, msg, len); + + /* + * Start of SNMP message. + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + /* + * Version 1 or 2 handled. + */ + if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + if (!asn1_uint_decode (&ctx, end, &vers)) + return 0; + if (debug > 1) + printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); + if (vers > 1) + return 1; + + /* + * Community. 
+ */ + if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) + return 0; + if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) + return 0; + if (debug > 1) { + unsigned int i; + + printk(KERN_DEBUG "bsalg: community: "); + for (i = 0; i < comm.len; i++) + printk("%c", comm.data[i]); + printk("\n"); + } + kfree(comm.data); + + /* + * PDU type + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) + return 0; + if (cls != ASN1_CTX || con != ASN1_CON) + return 0; + if (debug > 1) { + unsigned char *pdus[] = { + [SNMP_PDU_GET] = "get", + [SNMP_PDU_NEXT] = "get-next", + [SNMP_PDU_RESPONSE] = "response", + [SNMP_PDU_SET] = "set", + [SNMP_PDU_TRAP1] = "trapv1", + [SNMP_PDU_BULK] = "bulk", + [SNMP_PDU_INFORM] = "inform", + [SNMP_PDU_TRAP2] = "trapv2" + }; + + if (pdutype > SNMP_PDU_TRAP2) + printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); + else + printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); + } + if (pdutype != SNMP_PDU_RESPONSE && + pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) + return 1; + + /* + * Request header or v1 trap + */ + if (pdutype == SNMP_PDU_TRAP1) { + struct snmp_v1_trap trap; + unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); + + /* Discard trap allocations regardless */ + kfree(trap.id); + kfree((unsigned long *)trap.ip_address); + + if (!ret) + return ret; + + } else { + struct snmp_request req; + + if (!snmp_request_decode(&ctx, &req)) + return 0; + + if (debug > 1) + printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " + "error_index=%u\n", req.id, req.error_status, + req.error_index); + } + + /* + * Loop through objects, look for IP addresses to mangle. 
+ */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (obj == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); + return 0; + } + + while (!asn1_eoc_decode(&ctx, eoc)) { + unsigned int i; + + if (!snmp_object_decode(&ctx, obj)) { + if (*obj) { + if ((*obj)->id) + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + return 0; + } + + if (debug > 1) { + printk(KERN_DEBUG "bsalg: object: "); + for (i = 0; i < (*obj)->id_len; i++) { + if (i > 0) + printk("."); + printk("%lu", (*obj)->id[i]); + } + printk(": type=%u\n", (*obj)->type); + + } + + if ((*obj)->type == SNMP_IPADDR) + mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + + if (!asn1_eoc_decode(&ctx, eoc)) + return 0; + + return 1; +} + +/***************************************************************************** + * + * NAT routines. + * + *****************************************************************************/ + +/* + * SNMP translation routine. + */ +static int snmp_translate(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + u_int16_t udplen = ntohs(udph->len); + u_int16_t paylen = udplen - sizeof(struct udphdr); + int dir = CTINFO2DIR(ctinfo); + struct oct1_map map; + + /* + * Determine mappping for application layer addresses based + * on NAT manipulations for the packet. 
+ */ + if (dir == IP_CT_DIR_ORIGINAL) { + /* SNAT traps */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip); + } else { + /* DNAT replies */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip); + } + + if (map.from == map.to) + return NF_ACCEPT; + + if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), + paylen, &map, &udph->check)) { + if (net_ratelimit()) + printk(KERN_WARNING "bsalg: parser failed\n"); + return NF_DROP; + } + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted */ +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* + * Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. 
+ */ + if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) { + if (net_ratelimit()) + printk(KERN_WARNING "SNMP: dropping malformed packet " + "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + return NF_DROP; + } + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, ctinfo, pskb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static struct ip_conntrack_helper snmp_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp", + + .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static struct ip_conntrack_helper snmp_trap_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp_trap", + + .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +/***************************************************************************** + * + * Module stuff. 
+ * + *****************************************************************************/ + +static int __init init(void) +{ + int ret = 0; + + ret = ip_conntrack_helper_register(&snmp_helper); + if (ret < 0) + return ret; + ret = ip_conntrack_helper_register(&snmp_trap_helper); + if (ret < 0) { + ip_conntrack_helper_unregister(&snmp_helper); + return ret; + } + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&snmp_helper); + ip_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(init); +module_exit(fini); + +module_param(debug, bool, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c new file mode 100644 index 000000000000..dec4a74212cd --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -0,0 +1,349 @@ +/* This file contains all the functions required for the standalone + ip_nat module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * 23 Apr 2001: Harald Welte <laforge@gnumonks.org> + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/icmp.h> +#include <linux/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/spinlock.h> + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include <linux/netfilter_ipv4/ip_nat.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_nat_protocol.h> +#include <linux/netfilter_ipv4/ip_nat_core.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/listhelp.h> + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \ + : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \ + : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \ + : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN" \ + : "*ERROR*"))) + +static unsigned int +ip_nat_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_nat_info *info; + /* maniptype == SRC for postrouting. */ + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + and local-out, and ip_nat_out protects post-routing. 
*/ + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off + & htons(IP_MF|IP_OFFSET))); + + (*pskb)->nfcache |= NFC_UNKNOWN; + + /* If we had a hardware checksum before, it's now invalid */ + if ((*pskb)->ip_summed == CHECKSUM_HW) + if (skb_checksum_help(*pskb, (out == NULL))) + return NF_DROP; + + ct = ip_conntrack_get(*pskb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + have dropped it. Hence it's the user's responsibilty to + packet filter it out, or implement conntrack/NAT for that + protocol. 8) --RR */ + if (!ct) { + /* Exception: ICMP redirect to new connection (not in + hash table yet). We must not let this through, in + case we're doing NAT to the same network. */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_hdr), &_hdr); + if (hp != NULL && + hp->type == ICMP_REDIRECT) + return NF_DROP; + } + return NF_ACCEPT; + } + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (!icmp_reply_translation(pskb, ct, maniptype, + CTINFO2DIR(ctinfo))) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + info = &ct->nat.info; + + /* Seen it before? This can happen for loopback, retrans, + or local packets.. */ + if (!ip_nat_initialized(ct, maniptype)) { + unsigned int ret; + + /* LOCAL_IN hook doesn't have a chain! */ + if (hooknum == NF_IP_LOCAL_IN) + ret = alloc_null_binding(ct, info, hooknum); + else + ret = ip_nat_rule_find(pskb, hooknum, + in, out, ct, + info); + + if (ret != NF_ACCEPT) { + return ret; + } + } else + DEBUGP("Already setup manip %s for ct %p\n", + maniptype == IP_NAT_MANIP_SRC ? 
"SRC" : "DST", + ct); + break; + + default: + /* ESTABLISHED */ + IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED + || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + info = &ct->nat.info; + } + + IP_NF_ASSERT(info); + return nat_packet(ct, ctinfo, hooknum, pskb); +} + +static unsigned int +ip_nat_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) { + dst_release((*pskb)->dst); + (*pskb)->dst = NULL; + } + return ret; +} + +static unsigned int +ip_nat_out(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + /* We can hit fragment here; forwarded packets get + defragmented by connection tracking coming in, then + fragmented (grr) by the forward code. + + In future: If we have nfct != NULL, AND we have NAT + initialized, AND there is no helper, then we can do full + NAPT on the head, and IP-address-only NAT on the rest. + + I'm starting to have nightmares about fragments. 
*/ + + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT); + + if (!*pskb) + return NF_STOLEN; + } + + return ip_nat_fn(hooknum, pskb, in, out, okfn); +} + +static unsigned int +ip_nat_local_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + return ret; +} + +/* We must be after connection tracking and before packet filtering. */ + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_in_ops = { + .hook = ip_nat_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source */ +static struct nf_hook_ops ip_nat_out_ops = { + .hook = ip_nat_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, +}; + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_local_out_ops = { + .hook = ip_nat_local_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */ +static struct nf_hook_ops ip_nat_local_in_ops = { + .hook = ip_nat_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, +}; + +static int init_or_cleanup(int init) +{ + 
int ret = 0; + + need_ip_conntrack(); + + if (!init) goto cleanup; + + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_nothing; + } + ret = ip_nat_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_rule_init; + } + ret = nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_nat; + } + ret = nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; + } + return ret; + + cleanup: + nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: + nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_outops: + nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_nat_in_ops); + cleanup_nat: + ip_nat_cleanup(); + cleanup_rule_init: + ip_nat_rule_cleanup(); + cleanup_nothing: + MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock); + return ret; +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +EXPORT_SYMBOL(ip_nat_setup_info); +EXPORT_SYMBOL(ip_nat_protocol_register); +EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL(ip_nat_cheat_check); +EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); +EXPORT_SYMBOL(ip_nat_mangle_udp_packet); +EXPORT_SYMBOL(ip_nat_used_tuple); +EXPORT_SYMBOL(ip_nat_follow_master); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c new file mode 100644 index 000000000000..0343e0d64674 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_tftp.c 
@@ -0,0 +1,70 @@ +/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Version: 0.0.7 + * + * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org> + * - Port to newnat API + * + * This module currently supports DNAT: + * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y + * + * and SNAT: + * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x } + * + * It has not been tested with + * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip + * If you do test this please let me know if it works or not. + * + */ + +#include <linux/module.h> +#include <linux/netfilter_ipv4.h> +#include <linux/ip.h> +#include <linux/udp.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_conntrack_tftp.h> +#include <linux/netfilter_ipv4/ip_nat_helper.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/moduleparam.h> + +MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>"); +MODULE_DESCRIPTION("tftp NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp) +{ + exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = ip_nat_follow_master; + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_tftp_hook = NULL; + /* Make sure noone calls it, meanwhile. 
*/ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_tftp_hook); + ip_nat_tftp_hook = help; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c new file mode 100644 index 000000000000..9e40dffc204f --- /dev/null +++ b/net/ipv4/netfilter/ip_queue.c @@ -0,0 +1,741 @@ +/* + * This is a module which is used for queueing IPv4 packets and + * communicating with userspace via netlink. + * + * (C) 2000-2002 James Morris <jmorris@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). + * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). + * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian + * Zander). + * 2000-08-01: Added Nick Williams' MAC support. + * 2002-06-25: Code cleanup. + * 2005-01-10: Added /proc counter for dropped packets; fixed so + * packets aren't delivered to user space if they're going + * to be dropped. 
+ * + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/init.h> +#include <linux/ip.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_queue.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netlink.h> +#include <linux/spinlock.h> +#include <linux/sysctl.h> +#include <linux/proc_fs.h> +#include <linux/security.h> +#include <net/sock.h> +#include <net/route.h> + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" + +struct ipq_rt_info { + __u8 tos; + __u32 daddr; + __u32 saddr; +}; + +struct ipq_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + struct ipq_rt_info rt_info; +}; + +typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); + +static unsigned char copy_mode = IPQ_COPY_NONE; +static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT; +static DEFINE_RWLOCK(queue_lock); +static int peer_pid; +static unsigned int copy_range; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl; +static LIST_HEAD(queue_list); +static DECLARE_MUTEX(ipqnl_sem); + +static void +ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) +{ + nf_reinject(entry->skb, entry->info, verdict); + kfree(entry); +} + +static inline void +__ipq_enqueue_entry(struct ipq_queue_entry *entry) +{ + list_add(&entry->list, &queue_list); + queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. 
+ */ +static inline struct ipq_queue_entry * +__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue_list) { + struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__ipq_dequeue_entry(struct ipq_queue_entry *entry) +{ + list_del(&entry->list); + queue_total--; +} + +static inline struct ipq_queue_entry * +__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + entry = __ipq_find_entry(cmpfn, data); + if (entry == NULL) + return NULL; + + __ipq_dequeue_entry(entry); + return entry; +} + + +static inline void +__ipq_flush(int verdict) +{ + struct ipq_queue_entry *entry; + + while ((entry = __ipq_find_dequeue_entry(NULL, 0))) + ipq_issue_verdict(entry, verdict); +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + copy_mode = mode; + copy_range = range; + if (copy_range > 0xFFFF) + copy_range = 0xFFFF; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NF_DROP); +} + +static struct ipq_queue_entry * +ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + write_lock_bh(&queue_lock); + entry = __ipq_find_dequeue_entry(cmpfn, data); + write_unlock_bh(&queue_lock); + return entry; +} + +static void +ipq_flush(int verdict) +{ + write_lock_bh(&queue_lock); + __ipq_flush(verdict); + write_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct 
sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + + read_lock_bh(&queue_lock); + + switch (copy_mode) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + data_len = 0; + break; + + case IPQ_COPY_PACKET: + if (copy_range == 0 || copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = copy_range; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + read_unlock_bh(&queue_lock); + return NULL; + } + + read_unlock_bh(&queue_lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + pmsg->timestamp_sec = entry->skb->stamp.tv_sec; + pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->mark = entry->skb->nfmark; + pmsg->hook = entry->info->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->info->indev) + strcpy(pmsg->indev_name, entry->info->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->info->outdev) + strcpy(pmsg->outdev_name, entry->info->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->info->indev && entry->skb->dev) { + pmsg->hw_type = entry->skb->dev->type; + if (entry->skb->dev->hard_header_parse) + pmsg->hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct ipq_queue_entry *entry; + + if 
(copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + + if (entry->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = skb->nh.iph; + + entry->rt_info.tos = iph->tos; + entry->rt_info.daddr = iph->daddr; + entry->rt_info.saddr = iph->saddr; + } + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + goto err_out_free; + + write_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + write_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + write_unlock_bh(&queue_lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +{ + int diff; + struct iphdr *user_iph = (struct iphdr *)v->payload; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + 
kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_ip_make_writable(&e->skb, v->data_len)) + return -ENOMEM; + memcpy(e->skb->data, v->payload, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + + /* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + if (e->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = e->skb->nh.iph; + + if (!(iph->tos == e->rt_info.tos + && iph->daddr == e->rt_info.daddr + && iph->saddr == e->rt_info.saddr)) + return ip_route_me_harder(&e->skb); + } + return 0; +} + +static inline int +id_cmp(struct ipq_queue_entry *e, unsigned long id) +{ + return (id == (unsigned long )e); +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct ipq_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv4(vmsg, entry) < 0) + verdict = NF_DROP; + + ipq_issue_verdict(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + write_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + write_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +{ + if 
(entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + struct ipq_queue_entry *entry; + + while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) + ipq_issue_verdict(entry, NF_DROP); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags, nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = (struct nlmsghdr *)skb->data; + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + write_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + write_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + skblen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +static void +ipq_rcv_sk(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (down_trylock(&ipqnl_sem)) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + ipq_rcv_skb(skb); + kfree_skb(skb); + } + + up(&ipqnl_sem); + + } while (ipqnl && ipqnl->sk_receive_queue.qlen); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, 
void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_FIREWALL && n->pid) { + write_lock_bh(&queue_lock); + if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .ctl_name = NET_IPQ_QMAX, + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_dir_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ipq_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipq_dir_table + }, + { .ctl_name = 0 } +}; + +#ifdef CONFIG_PROC_FS +static int +ipq_get_info(char *buffer, char **start, off_t offset, int length) +{ + int len; + + read_lock_bh(&queue_lock); + + len = sprintf(buffer, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. 
length : %u\n" + "Queue dropped : %u\n" + "Netlink dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + read_unlock_bh(&queue_lock); + + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + else if (len < 0) + len = 0; + return len; +} +#endif /* CONFIG_PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc; + + if (!init) + goto cleanup; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + if (ipqnl == NULL) { + printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + if (proc) + proc->owner = THIS_MODULE; + else { + printk(KERN_ERR "ip_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } + + register_netdevice_notifier(&ipq_dev_notifier); + ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); + + status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + if (status < 0) { + printk(KERN_ERR "ip_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup: + nf_unregister_queue_handler(PF_INET); + synchronize_net(); + ipq_flush(NF_DROP); + +cleanup_sysctl: + unregister_sysctl_table(ipq_sysctl_header); + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(IPQ_PROC_FS_NAME); + +cleanup_ipqnl: + sock_release(ipqnl->sk_socket); + down(&ipqnl_sem); + up(&ipqnl_sem); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("IPv4 packet queue handler"); +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_LICENSE("GPL"); + +module_init(init); 
+module_exit(fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c new file mode 100644 index 000000000000..8a54f92b8496 --- /dev/null +++ b/net/ipv4/netfilter/ip_tables.c @@ -0,0 +1,1964 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Jan 2002 Harald Welte <laforge@gnumonks.org> + * - increase module usage count as soon as we have rules inside + * a table + */ +#include <linux/config.h> +#include <linux/cache.h> +#include <linux/skbuff.h> +#include <linux/kmod.h> +#include <linux/vmalloc.h> +#include <linux/netdevice.h> +#include <linux/module.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <net/ip.h> +#include <asm/uaccess.h> +#include <asm/semaphore.h> +#include <linux/proc_fs.h> +#include <linux/err.h> + +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("IPv4 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif
+/*
+ * IP_NF_ASSERT(x): with CONFIG_NETFILTER_DEBUG, printk the function, file
+ * and line when x is false; without it the macro expands to nothing, so
+ * x is not evaluated.
+ */
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x)						\
+do {								\
+	if (!(x))						\
+		printk("IP_NF_ASSERT: %s:%s:%u\n",		\
+		       __FUNCTION__, __FILE__, __LINE__);	\
+} while(0)
+#else
+#define IP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) /* round x up to a multiple of SMP_CACHE_BYTES (mask form assumes a power of two) */
+
+static DECLARE_MUTEX(ipt_mutex); /* taken around every walk/update of the ipt_target, ipt_match and ipt_tables lists below */
+
+/* Must have mutex.  The check treats a failing (non-zero) down_trylock()
+ * as "mutex already held".  NOTE(review): if the assertion is wrong the
+ * trylock presumably succeeds and leaves the mutex taken -- confirm. */
+#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+/*
+   We keep a set of rules for each CPU, so we can avoid write-locking
+   them in the softirq when updating the counters and therefore
+   only need to read-lock in the softirq; doing a write_lock_bh() in user
+   context stops packets coming through and allows user context to read
+   the counters or update the rules.
+
+   To be cache friendly on SMP, we arrange them like so:
+   [ n-entries ]
+   ... cache-align padding ...
+   [ n-entries ]
+
+   Hence the start of any table is given by get_table() below.
+   (In the code visible here the per-CPU start is computed with
+   TABLE_OFFSET().)  */
+
+/* The table itself */
+struct ipt_table_info
+{
+	/* Size per table */
+	unsigned int size;
+	/* Number of entries: FIXME. --RR */
+	unsigned int number;
+	/* Initial number of entries. Needed for module usage count */
+	unsigned int initial_entries;
+
+	/* Entry points and underflows */
+	unsigned int hook_entry[NF_IP_NUMHOOKS];
+	unsigned int underflow[NF_IP_NUMHOOKS];
+
+	/* ipt_entry tables: one per CPU */
+	char entries[0] ____cacheline_aligned;
+};
+
+static LIST_HEAD(ipt_target);
+static LIST_HEAD(ipt_match);
+static LIST_HEAD(ipt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+/* TABLE_OFFSET(t,p): byte offset of CPU p's copy of the rules inside
+ * t->entries; always 0 when CONFIG_SMP is not set. */
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+#if 0
+#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
+#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
+#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
+#endif
+
+/* Returns whether the packet matches the IP part of a rule or not
+ * (1 = match, 0 = no match).
+ *
+ * Checks, in order: masked source/destination address, input and output
+ * interface name, protocol (only when ipinfo->proto is non-zero) and the
+ * fragment flag.  Each test can be inverted through ipinfo->invflags.
+ *
+ * indev/outdev are read as arrays of unsigned long covering IFNAMSIZ
+ * bytes, so callers must pass buffers of at least that size that are
+ * suitably aligned (ipt_do_table() uses an aligned all-zero name when
+ * there is no device). */
+static inline int
+ip_packet_match(const struct iphdr *ip,
+		const char *indev,
+		const char *outdev,
+		const struct ipt_ip *ipinfo,
+		int isfrag)
+{
+	size_t i;
+	unsigned long ret;
+
+#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
+
+	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+		  IPT_INV_SRCIP)
+	    || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+		     IPT_INV_DSTIP)) {
+		dprintf("Source or dest mismatch.\n");
+
+		dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->saddr),
+			NIPQUAD(ipinfo->smsk.s_addr),
+			NIPQUAD(ipinfo->src.s_addr),
+			ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->daddr),
+			NIPQUAD(ipinfo->dmsk.s_addr),
+			NIPQUAD(ipinfo->dst.s_addr),
+			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+		return 0;
+	}
+
+	/* Look for ifname matches; this should unroll nicely.  ret ends up
+	   non-zero if any masked word of the two names differs. */
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)indev)[i]
+			^ ((const unsigned long *)ipinfo->iniface)[i])
+			& ((const unsigned long *)ipinfo->iniface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, ipinfo->iniface,
+			ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+		return 0;
+	}
+
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)outdev)[i]
+			^ ((const unsigned long *)ipinfo->outiface)[i])
+			& ((const unsigned long *)ipinfo->outiface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, ipinfo->outiface,
+			ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+		return 0;
+	}
+
+	/* Check specific protocol */
+	if (ipinfo->proto
+	    && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+		dprintf("Packet protocol %hi does not match %hi.%s\n",
+			ip->protocol, ipinfo->proto,
+			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+		return 0;
+	}
+
+	/* If we have a fragment rule but the packet is not a fragment
+	 * then we return zero */
+	if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+		dprintf("Fragment rule but not fragment.%s\n",
+			ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+		return 0;
+	}
+
+	return 1;
+}
+/*
+ * Validate the flag words of a user-supplied ipt_ip: returns 0 if any
+ * bit outside IPT_F_MASK / IPT_INV_MASK is set, 1 otherwise.
+ */
+static inline int
+ip_checkentry(const struct ipt_ip *ip)
+{
+	if (ip->flags & ~IPT_F_MASK) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 ip->flags & ~IPT_F_MASK);
+		return 0;
+	}
+	if (ip->invflags & ~IPT_INV_MASK) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 ip->invflags & ~IPT_INV_MASK);
+		return 0;
+	}
+	return 1;
+}
+/*
+ * Target function that logs targinfo as a string (rate limited) and
+ * always returns NF_DROP.
+ */
+static unsigned int
+ipt_error(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	if (net_ratelimit())
+		printk("ip_tables: error: `%s'\n", (char *)targinfo);
+
+	return NF_DROP;
+}
+/*
+ * Iterator callback for IPT_MATCH_ITERATE: runs one match extension.
+ * Note the inverted sense: returns 1 (stop iterating) when the match
+ * does NOT match, 0 (keep going) when it does.
+ */
+static inline
+int do_match(struct ipt_entry_match *m,
+	     const struct sk_buff *skb,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     int offset,
+	     int *hotdrop)
+{
+	/* Stop iteration if it doesn't match */
+	if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
+		return 1;
+	else
+		return 0;
+}
+/* Return the entry located `offset' bytes after `base'. */
+static inline struct ipt_entry *
+get_entry(void *base, unsigned int offset)
+{
+	return (struct ipt_entry *)(base + offset);
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT.
+ *
+ * Runs the packet in *pskb through `table' for hook number `hook'.
+ * Holds table->lock for reading (bottom halves disabled) for the whole
+ * walk and works on the current CPU's copy of the rules.
+ *
+ * For each entry whose IP part and all matches match, the entry's
+ * counters are bumped and its target is applied:
+ *  - standard target (no ->target function), negative verdict:
+ *    IPT_RETURN pops to the saved `back' entry, anything else yields
+ *    the final verdict (-v - 1);
+ *  - standard target, non-negative verdict: jump to that offset; unless
+ *    it is a plain fall-through to the next entry, the current back
+ *    pointer is saved in the next entry's comefrom first;
+ *  - extension target: its function is called; IPT_CONTINUE moves on to
+ *    the next entry, any other value is the final verdict.
+ * A match setting *hotdrop ends the walk and forces NF_DROP. */
+unsigned int
+ipt_do_table(struct sk_buff **pskb,
+	     unsigned int hook,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     struct ipt_table *table,
+	     void *userdata)
+{
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	u_int16_t offset;
+	struct iphdr *ip;
+	u_int16_t datalen;
+	int hotdrop = 0;
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	void *table_base;
+	struct ipt_entry *e, *back;
+
+	/* Initialization */
+	ip = (*pskb)->nh.iph;
+	datalen = (*pskb)->len - ip->ihl * 4;
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+	read_lock_bh(&table->lock);
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	table_base = (void *)table->private->entries
+		+ TABLE_OFFSET(table->private, smp_processor_id());
+	e = get_entry(table_base, table->private->hook_entry[hook]);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	/* Check no one else is using our table: the first entry's comefrom
+	   doubles as an in-use marker in debug builds. */
+	if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
+	    && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
+		printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
+		       smp_processor_id(),
+		       table->name,
+		       &((struct ipt_entry *)table_base)->comefrom,
+		       ((struct ipt_entry *)table_base)->comefrom);
+	}
+	((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
+#endif
+
+	/* For return from builtin chain */
+	back = get_entry(table_base, table->private->underflow[hook]);
+
+	do {
+		IP_NF_ASSERT(e);
+		IP_NF_ASSERT(back);
+		(*pskb)->nfcache |= e->nfcache;
+		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+			struct ipt_entry_target *t;
+
+			if (IPT_MATCH_ITERATE(e, do_match,
+					      *pskb, in, out,
+					      offset, &hotdrop) != 0)
+				goto no_match;
+
+			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+
+			t = ipt_get_target(e);
+			IP_NF_ASSERT(t->u.kernel.target);
+			/* Standard target? */
+			if (!t->u.kernel.target->target) {
+				int v;
+
+				v = ((struct ipt_standard_target *)t)->verdict;
+				if (v < 0) {
+					/* Pop from stack? */
+					if (v != IPT_RETURN) {
+						verdict = (unsigned)(-v) - 1;
+						break;
+					}
+					e = back;
+					back = get_entry(table_base,
+							 back->comefrom);
+					continue;
+				}
+				if (table_base + v
+				    != (void *)e + e->next_offset) {
+					/* Save old back ptr in next entry */
+					struct ipt_entry *next
+						= (void *)e + e->next_offset;
+					next->comefrom
+						= (void *)back - table_base;
+					/* set back pointer to next entry */
+					back = next;
+				}
+
+				e = get_entry(table_base, v);
+			} else {
+				/* Targets which reenter must return
+                                   abs. verdicts */
+#ifdef CONFIG_NETFILTER_DEBUG
+				((struct ipt_entry *)table_base)->comefrom
+					= 0xeeeeeeec;
+#endif
+				verdict = t->u.kernel.target->target(pskb,
+								     in, out,
+								     hook,
+								     t->data,
+								     userdata);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+				if (((struct ipt_entry *)table_base)->comefrom
+				    != 0xeeeeeeec
+				    && verdict == IPT_CONTINUE) {
+					printk("Target %s reentered!\n",
+					       t->u.kernel.target->name);
+					verdict = NF_DROP;
+				}
+				((struct ipt_entry *)table_base)->comefrom
+					= 0x57acc001;
+#endif
+				/* Target might have changed stuff. */
+				ip = (*pskb)->nh.iph;
+				datalen = (*pskb)->len - ip->ihl * 4;
+
+				if (verdict == IPT_CONTINUE)
+					e = (void *)e + e->next_offset;
+				else
+					/* Verdict */
+					break;
+			}
+		} else {
+
+		no_match:
+			e = (void *)e + e->next_offset;
+		}
+	} while (!hotdrop);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
+#endif
+	read_unlock_bh(&table->lock);
+
+#ifdef DEBUG_ALLOW_ALL
+	return NF_ACCEPT;
+#else
+	if (hotdrop)
+		return NF_DROP;
+	else return verdict;
+#endif
+}
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error.
NULL means no such table (the mutex has already been released); on success the mutex is still held and the caller must up() it. */
+static inline struct ipt_table *find_table_lock(const char *name)
+{
+	struct ipt_table *t;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &ipt_tables, list)
+		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+			return t;
+	up(&ipt_mutex);
+	return NULL;
+}
+
+/* Find match, grabs ref.  Returns ERR_PTR() on error.
+ * The mutex is only held during the list walk.  If nothing usable is
+ * found the result is ERR_PTR(err), where err is 0 (i.e. NULL) when the
+ * name is unknown and -EPROTOTYPE when the name exists but only with a
+ * different revision. */
+static inline struct ipt_match *find_match(const char *name, u8 revision)
+{
+	struct ipt_match *m;
+	int err = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(m, &ipt_match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision == revision) {
+				if (try_module_get(m->me)) {
+					up(&ipt_mutex);
+					return m;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&ipt_mutex);
+	return ERR_PTR(err);
+}
+
+/* Find target, grabs ref.  Returns ERR_PTR() on error.
+ * Same contract as find_match(), but walks the ipt_target list. */
+static inline struct ipt_target *find_target(const char *name, u8 revision)
+{
+	struct ipt_target *t;
+	int err = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &ipt_target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision == revision) {
+				if (try_module_get(t->me)) {
+					up(&ipt_mutex);
+					return t;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&ipt_mutex);
+	return ERR_PTR(err);
+}
+/*
+ * Look up a target by name and revision, requesting module "ipt_<name>"
+ * through try_then_request_module() if the first lookup finds nothing.
+ * Collapses every failure (including ERR_PTR values) to NULL.
+ */
+struct ipt_target *ipt_find_target(const char *name, u8 revision)
+{
+	struct ipt_target *target;
+
+	target = try_then_request_module(find_target(name, revision),
+					 "ipt_%s", name);
+	if (IS_ERR(target) || !target)
+		return NULL;
+	return target;
+}
+/*
+ * Scan registered matches called `name': raise *bestp to the highest
+ * revision seen and return 1 if `revision' itself is registered.
+ * Does no locking itself; find_revision() calls it with ipt_mutex held.
+ */
+static int match_revfn(const char *name, u8 revision, int *bestp)
+{
+	struct ipt_match *m;
+	int have_rev = 0;
+
+	list_for_each_entry(m, &ipt_match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision > *bestp)
+				*bestp = m->revision;
+			if (m->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+/* Same as match_revfn(), for the ipt_target list. */
+static int target_revfn(const char *name, u8 revision, int *bestp)
+{
+	struct ipt_target *t;
+	int have_rev = 0;
+
+	list_for_each_entry(t, &ipt_target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision > *bestp)
+				*bestp = t->revision;
+			if (t->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+
+/* Returns true or false (if no such extension at all).
+ * The real answer goes to *err: -EINTR if the mutex wait was
+ * interrupted, -ENOENT if the name is unknown (return value 0, so the
+ * caller can try loading a module), -EPROTONOSUPPORT if the name exists
+ * without this revision, otherwise the best revision found. */
+static inline int find_revision(const char *name, u8 revision,
+				int (*revfn)(const char *, u8, int *),
+				int *err)
+{
+	int have_rev, best = -1;
+
+	if (down_interruptible(&ipt_mutex) != 0) {
+		*err = -EINTR;
+		return 1;
+	}
+	have_rev = revfn(name, revision, &best);
+	up(&ipt_mutex);
+
+	/* Nothing at all?  Return 0 to try loading module. */
+	if (best == -1) {
+		*err = -ENOENT;
+		return 0;
+	}
+
+	*err = best;
+	if (!have_rev)
+		*err = -EPROTONOSUPPORT;
+	return 1;
+}
+
+
+/* All zeroes == unconditional rule.
+ * Scans *ip as an array of __u32 words; returns 1 only if all are 0. */
+static inline int
+unconditional(const struct ipt_ip *ip)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
+		if (((__u32 *)ip)[i])
+			return 0;
+
+	return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+   there are loops.  Puts hook bitmask in comefrom.
*/ +static int +mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ipt_entry *e + = (struct ipt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct ipt_standard_target *t + = (void *)ipt_get_target(e); + + if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ipt_entry) + && (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ip)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<<NF_IP_NUMHOOKS); +#ifdef DEBUG_IP_FIREWALL_USER + if (e->comefrom + & (1 << NF_IP_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ipt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ipt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... 
this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ipt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ipt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +standard_check(const struct ipt_entry_target *t, + unsigned int max_offset) +{ + struct ipt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != IPT_ALIGN(sizeof(struct ipt_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IPT_ALIGN(sizeof(struct ipt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ipt_entry)) { + duprintf("ipt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ipt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ipt_entry_match *m, + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) +{ + struct ipt_match *match; + + match = try_then_request_module(find_match(m->u.user.name, + m->u.user.revision), + "ipt_%s", m->u.user.name); + if (IS_ERR(match) || !match) { + duprintf("check_match: `%s' not found\n", m->u.user.name); + return match ? 
PTR_ERR(match) : -ENOENT; + } + m->u.kernel.match = match; + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ip, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + module_put(m->u.kernel.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ipt_target ipt_standard_target; + +static inline int +check_entry(struct ipt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ipt_entry_target *t; + struct ipt_target *target; + int ret; + unsigned int j; + + if (!ip_checkentry(&e->ip)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ipt_get_target(e); + target = try_then_request_module(find_target(t->u.user.name, + t->u.user.revision), + "ipt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + ret = target ? 
PTR_ERR(target) : -ENOENT; + goto cleanup_matches; + } + t->u.kernel.target = target; + + if (t->u.kernel.target == &ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IPT_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ipt_entry *e, + struct ipt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 + || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IPT_RETURN). 
--RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ipt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ipt_entry *e, unsigned int *i) +{ + struct ipt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IPT_MATCH_ITERATE(e, cleanup_match, NULL); + t = ipt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ipt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. 
*/ + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ipt_table_info * +replace_table(struct ipt_table *table, + unsigned int num_counters, + struct ipt_table_info *newinfo, + int *error) +{ + struct ipt_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ipt_entry *table_base; + unsigned int i; + + for (i = 0; i < num_possible_cpus(); i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ipt_table_info *t, + struct ipt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ipt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct ipt_entry *e; + struct ipt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ipt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ipt_entry_match *m; + struct ipt_entry_target *t; + + e = (struct ipt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ipt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ipt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ipt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ipt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ipt_get_entries *entries, + struct ipt_get_entries __user *uptr) +{ + int ret; + struct ipt_table *t; + + t = find_table_lock(entries->name); + if (t && !IS_ERR(t)) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + module_put(t->me); + up(&ipt_mutex); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ipt_replace tmp; + struct ipt_table *t; + struct ipt_table_info *newinfo, *oldinfo; + struct ipt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = try_then_request_module(find_table_lock(tmp.name), + "iptable_%s", tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! 
*/ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct ipt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&ipt_mutex); + return ret; + + put_module: + module_put(t->me); + up(&ipt_mutex); + free_newinfo_counters_untrans: + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static inline int +add_counter_to_entry(struct ipt_entry *e, + const struct ipt_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct ipt_counters_info tmp, *paddc; + struct ipt_table *t; + int ret = 0; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ipt_mutex); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +static int +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) 
{ + case IPT_SO_GET_INFO: { + char name[IPT_TABLE_MAXNAMELEN]; + struct ipt_table *t; + + if (*len != sizeof(struct ipt_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ipt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[IPT_TABLE_MAXNAMELEN-1] = '\0'; + + t = try_then_request_module(find_table_lock(name), + "iptable_%s", name); + if (t && !IS_ERR(t)) { + struct ipt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + memcpy(info.name, name, sizeof(info.name)); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + up(&ipt_mutex); + module_put(t->me); + } else + ret = t ? PTR_ERR(t) : -ENOENT; + } + break; + + case IPT_SO_GET_ENTRIES: { + struct ipt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ipt_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ipt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + case IPT_SO_GET_REVISION_MATCH: + case IPT_SO_GET_REVISION_TARGET: { + struct ipt_get_revision rev; + int (*revfn)(const char *, u8, int *); + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + + if (cmd == IPT_SO_GET_REVISION_TARGET) + revfn = target_revfn; + else + revfn = match_revfn; + + try_then_request_module(find_revision(rev.name, rev.revision, + revfn, &ret), + "ipt_%s", rev.name); + break; + } + + default: + duprintf("do_ipt_get_ctl: unknown 
request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int +ipt_register_target(struct ipt_target *target) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + list_add(&target->list, &ipt_target); + up(&ipt_mutex); + return ret; +} + +void +ipt_unregister_target(struct ipt_target *target) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_target, target); + up(&ipt_mutex); +} + +int +ipt_register_match(struct ipt_match *match) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &ipt_match); + up(&ipt_mutex); + + return ret; +} + +void +ipt_unregister_match(struct ipt_match *match) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_match, match); + up(&ipt_mutex); +} + +int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) +{ + int ret; + struct ipt_table_info *newinfo; + static struct ipt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ipt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&ipt_tables, table); + + unlock: + up(&ipt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ipt_unregister_table(struct ipt_table *table) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_tables, table); + up(&ipt_mutex); + + /* Decrease module usage counts and free resources */ + IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct ipt_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. 
Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IPT_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & IPT_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +tcp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ip->proto == IPPROTO_TCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp)) + && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct ipt_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +udp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) { + duprintf("ipt_udp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp))); + return 0; + } + if (udpinfo->invflags & ~IPT_UDP_INV_MASK) { + duprintf("ipt_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code)) + ^ invert; +} + +static int +icmp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct icmphdr _icmph, *ic; + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ic = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags&IPT_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +icmp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ip->proto == IPPROTO_ICMP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp)) + && !(icmpinfo->invflags & ~IPT_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ipt_target ipt_standard_target = { + .name = IPT_STANDARD_TARGET, +}; + +static struct ipt_target ipt_error_target = { + .name = IPT_ERROR_TARGET, + .target = ipt_error, +}; + +static struct nf_sockopt_ops ipt_sockopts = { + .pf = PF_INET, + .set_optmin = IPT_BASE_CTL, + .set_optmax = IPT_SO_SET_MAX+1, + .set = do_ipt_set_ctl, + .get_optmin = IPT_BASE_CTL, + .get_optmax = IPT_SO_GET_MAX+1, + .get = do_ipt_get_ctl, +}; + +static struct ipt_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, +}; + +static struct ipt_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, +}; + +static struct ipt_match icmp_matchstruct = { + .name = "icmp", + .match = &icmp_match, + .checkentry = &icmp_checkentry, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const char *i, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", + i + sizeof(struct list_head)); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static inline int print_target(const struct ipt_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ipt_standard_target || t == &ipt_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, 
count); +} + +static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_tables, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} + +static int ipt_get_targets(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_target, print_target, struct ipt_target *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_match, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +{ { "ip_tables_names", ipt_get_tables }, + { "ip_tables_targets", ipt_get_targets }, + { "ip_tables_matches", ipt_get_matches }, + { NULL, NULL} }; +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ipt_mutex); + list_append(&ipt_target, &ipt_standard_target); + list_append(&ipt_target, &ipt_error_target); + list_append(&ipt_match, &tcp_matchstruct); + list_append(&ipt_match, &udp_matchstruct); + list_append(&ipt_match, &icmp_matchstruct); + up(&ipt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ipt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct 
proc_dir_entry *proc; + int i; + + for (i = 0; ipt_proc_entry[i].name; i++) { + proc = proc_net_create(ipt_proc_entry[i].name, 0, + ipt_proc_entry[i].get_info); + if (!proc) { + while (--i >= 0) + proc_net_remove(ipt_proc_entry[i].name); + nf_unregister_sockopt(&ipt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } + } +#endif + + printk("ip_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ipt_sockopts); +#ifdef CONFIG_PROC_FS + { + int i; + for (i = 0; ipt_proc_entry[i].name; i++) + proc_net_remove(ipt_proc_entry[i].name); + } +#endif +} + +EXPORT_SYMBOL(ipt_register_table); +EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_register_match); +EXPORT_SYMBOL(ipt_unregister_match); +EXPORT_SYMBOL(ipt_do_table); +EXPORT_SYMBOL(ipt_register_target); +EXPORT_SYMBOL(ipt_unregister_target); +EXPORT_SYMBOL(ipt_find_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c new file mode 100644 index 000000000000..9842e6e23184 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -0,0 +1,92 @@ +/* + * This is a module which is used for setting the skb->priority field + * of an skb for qdisc classification. + */ + +/* (C) 2001-2002 Patrick McHardy <kaber@trash.net> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CLASSIFY.h> + +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables qdisc classification target module"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_classify_target_info *clinfo = targinfo; + + if((*pskb)->priority != clinfo->priority) { + (*pskb)->priority = clinfo->priority; + (*pskb)->nfcache |= NFC_ALTERED; + } + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){ + printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_classify_target_info))); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { + printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD " + "and POST_ROUTING.\n"); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_ERR "CLASSIFY: can only be called from " + "\"mangle\" table, not \"%s\".\n", + tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_classify_reg = { + .name = "CLASSIFY", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_classify_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_classify_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c new file mode 100644 index 
000000000000..0f12e3a3dc73 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -0,0 +1,761 @@ +/* Cluster IP hashmark target + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * based on ideas of Fabio Olive Leite <olive@unixforge.org> + * + * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include <linux/module.h> +#include <linux/config.h> +#include <linux/proc_fs.h> +#include <linux/jhash.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/if_arp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +#include <net/checksum.h> + +#include <linux/netfilter_arp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +#define CLUSTERIP_VERSION "0.6" + +#define DEBUG_CLUSTERIP + +#ifdef DEBUG_CLUSTERIP +#define DEBUGP printk +#else +#define DEBUGP +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables target for CLUSTERIP"); + +struct clusterip_config { + struct list_head list; /* list of all configs */ + atomic_t refcount; /* reference count */ + + u_int32_t clusterip; /* the IP address */ + u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ + struct net_device *dev; /* device */ + u_int16_t num_total_nodes; /* total number of nodes */ + u_int16_t num_local_nodes; /* number of local nodes */ + u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */ + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; /* proc dir entry */ +#endif + enum clusterip_hashmode hash_mode; /* which hashing mode */ + u_int32_t hash_initval; 
/* hash initialization */ +}; + +static LIST_HEAD(clusterip_configs); + +/* clusterip_lock protects the clusterip_configs list _AND_ the configurable + * data within all structurses (num_local_nodes, local_nodes[]) */ +static DECLARE_RWLOCK(clusterip_lock); + +#ifdef CONFIG_PROC_FS +static struct file_operations clusterip_proc_fops; +static struct proc_dir_entry *clusterip_procdir; +#endif + +static inline void +clusterip_config_get(struct clusterip_config *c) { + atomic_inc(&c->refcount); +} + +static inline void +clusterip_config_put(struct clusterip_config *c) { + if (atomic_dec_and_test(&c->refcount)) { + WRITE_LOCK(&clusterip_lock); + list_del(&c->list); + WRITE_UNLOCK(&clusterip_lock); + dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); + dev_put(c->dev); + kfree(c); + } +} + + +static struct clusterip_config * +__clusterip_config_find(u_int32_t clusterip) +{ + struct list_head *pos; + + MUST_BE_READ_LOCKED(&clusterip_lock); + list_for_each(pos, &clusterip_configs) { + struct clusterip_config *c = list_entry(pos, + struct clusterip_config, list); + if (c->clusterip == clusterip) { + return c; + } + } + + return NULL; +} + +static inline struct clusterip_config * +clusterip_config_find_get(u_int32_t clusterip) +{ + struct clusterip_config *c; + + READ_LOCK(&clusterip_lock); + c = __clusterip_config_find(clusterip); + if (!c) { + READ_UNLOCK(&clusterip_lock); + return NULL; + } + atomic_inc(&c->refcount); + READ_UNLOCK(&clusterip_lock); + + return c; +} + +static struct clusterip_config * +clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip, + struct net_device *dev) +{ + struct clusterip_config *c; + char buffer[16]; + + c = kmalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return NULL; + + memset(c, 0, sizeof(*c)); + c->dev = dev; + c->clusterip = ip; + memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + c->num_total_nodes = i->num_total_nodes; + c->num_local_nodes = i->num_local_nodes; + memcpy(&c->local_nodes, &i->local_nodes, 
sizeof(&c->local_nodes)); + c->hash_mode = i->hash_mode; + c->hash_initval = i->hash_initval; + atomic_set(&c->refcount, 1); + +#ifdef CONFIG_PROC_FS + /* create proc dir entry */ + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); + c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir); + if (!c->pde) { + kfree(c); + return NULL; + } + c->pde->proc_fops = &clusterip_proc_fops; + c->pde->data = c; +#endif + + WRITE_LOCK(&clusterip_lock); + list_add(&c->list, &clusterip_configs); + WRITE_UNLOCK(&clusterip_lock); + + return c; +} + +static int +clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes >= CLUSTERIP_MAX_NODES + || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + /* check if we alrady have this number in our array */ + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + } + + c->local_nodes[c->num_local_nodes++] = nodenum; + + WRITE_UNLOCK(&clusterip_lock); + return 0; +} + +static int +clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); + memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); + c->num_local_nodes--; + WRITE_UNLOCK(&clusterip_lock); + return 0; + } + } + + WRITE_UNLOCK(&clusterip_lock); + return 1; +} + +static inline u_int32_t +clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) +{ + struct iphdr *iph = skb->nh.iph; + unsigned long hashval; + u_int16_t sport, dport; + struct tcphdr *th; + struct udphdr *uh; + struct icmphdr *ih; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = (void 
*)iph+iph->ihl*4; + sport = ntohs(th->source); + dport = ntohs(th->dest); + break; + case IPPROTO_UDP: + uh = (void *)iph+iph->ihl*4; + sport = ntohs(uh->source); + dport = ntohs(uh->dest); + break; + case IPPROTO_ICMP: + ih = (void *)iph+iph->ihl*4; + sport = ntohs(ih->un.echo.id); + dport = (ih->type<<8)|ih->code; + break; + default: + if (net_ratelimit()) { + printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", + iph->protocol); + } + sport = dport = 0; + } + + switch (config->hash_mode) { + case CLUSTERIP_HASHMODE_SIP: + hashval = jhash_1word(ntohl(iph->saddr), + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT: + hashval = jhash_2words(ntohl(iph->saddr), sport, + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT_DPT: + hashval = jhash_3words(ntohl(iph->saddr), sport, dport, + config->hash_initval); + break; + default: + /* to make gcc happy */ + hashval = 0; + /* This cannot happen, unless the check function wasn't called + * at rule load time */ + printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); + BUG(); + break; + } + + /* node numbers are 1..n, not 0..n */ + return ((hashval % config->num_total_nodes)+1); +} + +static inline int +clusterip_responsible(struct clusterip_config *config, u_int32_t hash) +{ + int i; + + READ_LOCK(&clusterip_lock); + + if (config->num_local_nodes == 0) { + READ_UNLOCK(&clusterip_lock); + return 0; + } + + for (i = 0; i < config->num_local_nodes; i++) { + if (config->local_nodes[i] == hash) { + READ_UNLOCK(&clusterip_lock); + return 1; + } + } + + READ_UNLOCK(&clusterip_lock); + + return 0; +} + +/*********************************************************************** + * IPTABLES TARGET + ***********************************************************************/ + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_clusterip_tgt_info 
*cipinfo = targinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + u_int32_t hash; + + /* don't need to clusterip_config_get() here, since refcount + * is only decremented by destroy() - and ip_tables guarantees + * that the ->target() function isn't called after ->destroy() */ + + if (!ct) { + printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); + /* FIXME: need to drop invalid ones, since replies + * to outgoing connections of other nodes will be + * marked as INVALID */ + return NF_DROP; + } + + /* special case: ICMP error handling. conntrack distinguishes between + * error messages (RELATED) and information requests (see below) */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP + && (ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) + return IPT_CONTINUE; + + /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, + * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here + * on, which all have an ID field [relevant for hashing]. */ + + hash = clusterip_hashfn(*pskb, cipinfo->config); + + switch (ctinfo) { + case IP_CT_NEW: + ct->mark = hash; + break; + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + /* FIXME: we don't handle expectations at the + * moment. they can arrive on a different node than + * the master connection (e.g. FTP passive mode) */ + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: + break; + default: + break; + } + +#ifdef DEBUG_CLUSTERP + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +#endif + DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + if (!clusterip_responsible(cipinfo->config, hash)) { + DEBUGP("not responsible\n"); + return NF_DROP; + } + DEBUGP("responsible\n"); + + /* despite being received via linklayer multicast, this is + * actually a unicast IP packet. 
TCP doesn't like PACKET_MULTICAST */ + (*pskb)->pkt_type = PACKET_HOST; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_clusterip_tgt_info *cipinfo = targinfo; + + struct clusterip_config *config; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) { + printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))); + return 0; + } + + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { + printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", + cipinfo->hash_mode); + return 0; + + } + if (e->ip.dmsk.s_addr != 0xffffffff + || e->ip.dst.s_addr == 0) { + printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); + return 0; + } + + /* FIXME: further sanity checks */ + + config = clusterip_config_find_get(e->ip.dst.s_addr); + if (!config) { + if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { + printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr)); + return 0; + } else { + struct net_device *dev; + + if (e->ip.iniface[0] == '\0') { + printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); + return 0; + } + + dev = dev_get_by_name(e->ip.iniface); + if (!dev) { + printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); + return 0; + } + + config = clusterip_config_init(cipinfo, + e->ip.dst.s_addr, dev); + if (!config) { + printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); + dev_put(dev); + return 0; + } + dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); + } + } + + cipinfo->config = config; + + return 1; +} + +/* drop reference count of cluster config when rule is deleted */ +static void destroy(void *matchinfo, unsigned int 
matchinfosize) +{ + struct ipt_clusterip_tgt_info *cipinfo = matchinfo; + + /* we first remove the proc entry and then drop the reference + * count. In case anyone still accesses the file, the open/close + * functions are also incrementing the refcount on their own */ +#ifdef CONFIG_PROC_FS + remove_proc_entry(cipinfo->config->pde->name, + cipinfo->config->pde->parent); +#endif + clusterip_config_put(cipinfo->config); +} + +static struct ipt_target clusterip_tgt = { + .name = "CLUSTERIP", + .target = &target, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + + +/*********************************************************************** + * ARP MANGLING CODE + ***********************************************************************/ + +/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ +struct arp_payload { + u_int8_t src_hw[ETH_ALEN]; + u_int32_t src_ip; + u_int8_t dst_hw[ETH_ALEN]; + u_int32_t dst_ip; +} __attribute__ ((packed)); + +#ifdef CLUSTERIP_DEBUG +static void arp_print(struct arp_payload *payload) +{ +#define HBUFFERLEN 30 + char hbuffer[HBUFFERLEN]; + int j,k; + const char hexbuf[]= "0123456789abcdef"; + + for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15]; + hbuffer[k++]=hexbuf[payload->src_hw[j]&15]; + hbuffer[k++]=':'; + } + hbuffer[--k]='\0'; + + printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", + NIPQUAD(payload->src_ip), hbuffer, + NIPQUAD(payload->dst_ip)); +} +#endif + +static unsigned int +arp_mangle(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct arphdr *arp = (*pskb)->nh.arph; + struct arp_payload *payload; + struct clusterip_config *c; + + /* we don't care about non-ethernet and non-ipv4 ARP */ + if (arp->ar_hrd != htons(ARPHRD_ETHER) + || arp->ar_pro != htons(ETH_P_IP) + || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + return NF_ACCEPT; + + /* we only 
want to mangle arp replies */ + if (arp->ar_op != htons(ARPOP_REPLY)) + return NF_ACCEPT; + + payload = (void *)(arp+1); + + /* if there is no clusterip configuration for the arp reply's + * source ip, we don't want to mangle it */ + c = clusterip_config_find_get(payload->src_ip); + if (!c) + return NF_ACCEPT; + + /* normally the linux kernel always replies to arp queries of + * addresses on different interfacs. However, in the CLUSTERIP case + * this wouldn't work, since we didn't subscribe the mcast group on + * other interfaces */ + if (c->dev != out) { + DEBUGP("CLUSTERIP: not mangling arp reply on different " + "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name); + clusterip_config_put(c); + return NF_ACCEPT; + } + + /* mangle reply hardware address */ + memcpy(payload->src_hw, c->clustermac, arp->ar_hln); + +#ifdef CLUSTERIP_DEBUG + DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: "); + arp_print(payload); +#endif + + clusterip_config_put(c); + + return NF_ACCEPT; +} + +static struct nf_hook_ops cip_arp_ops = { + .hook = arp_mangle, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 +}; + +/*********************************************************************** + * PROC DIR HANDLING + ***********************************************************************/ + +#ifdef CONFIG_PROC_FS + +static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx; + + READ_LOCK(&clusterip_lock); + if (*pos >= c->num_local_nodes) + return NULL; + + nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL); + if (!nodeidx) + return ERR_PTR(-ENOMEM); + + *nodeidx = *pos; + return nodeidx; +} + +static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + *pos = ++(*nodeidx); + if (*pos >= c->num_local_nodes) { + 
kfree(v); + return NULL; + } + return nodeidx; +} + +static void clusterip_seq_stop(struct seq_file *s, void *v) +{ + kfree(v); + + READ_UNLOCK(&clusterip_lock); +} + +static int clusterip_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + if (*nodeidx != 0) + seq_putc(s, ','); + seq_printf(s, "%u", c->local_nodes[*nodeidx]); + + if (*nodeidx == c->num_local_nodes-1) + seq_putc(s, '\n'); + + return 0; +} + +static struct seq_operations clusterip_seq_ops = { + .start = clusterip_seq_start, + .next = clusterip_seq_next, + .stop = clusterip_seq_stop, + .show = clusterip_seq_show, +}; + +static int clusterip_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &clusterip_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + + sf->private = pde; + + clusterip_config_get(c); + } + + return ret; +} + +static int clusterip_proc_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + int ret; + + ret = seq_release(inode, file); + + if (!ret) + clusterip_config_put(c); + + return ret; +} + +static ssize_t clusterip_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *ofs) +{ +#define PROC_WRITELEN 10 + char buffer[PROC_WRITELEN+1]; + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct clusterip_config *c = pde->data; + unsigned long nodenum; + + if (copy_from_user(buffer, input, PROC_WRITELEN)) + return -EFAULT; + + if (*buffer == '+') { + nodenum = simple_strtoul(buffer+1, NULL, 10); + if (clusterip_add_node(c, nodenum)) + return -ENOMEM; + } else if (*buffer == '-') { + nodenum = simple_strtoul(buffer+1, NULL,10); + if (clusterip_del_node(c, nodenum)) + return -ENOENT; + } else + return -EIO; + + 
return size; +} + +static struct file_operations clusterip_proc_fops = { + .owner = THIS_MODULE, + .open = clusterip_proc_open, + .read = seq_read, + .write = clusterip_proc_write, + .llseek = seq_lseek, + .release = clusterip_proc_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static int init_or_cleanup(int fini) +{ + int ret; + + if (fini) + goto cleanup; + + if (ipt_register_target(&clusterip_tgt)) { + ret = -EINVAL; + goto cleanup_none; + } + + if (nf_register_hook(&cip_arp_ops) < 0) { + ret = -EINVAL; + goto cleanup_target; + } + +#ifdef CONFIG_PROC_FS + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net); + if (!clusterip_procdir) { + printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_hook; + } +#endif /* CONFIG_PROC_FS */ + + printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", + CLUSTERIP_VERSION); + + return 0; + +cleanup: + printk(KERN_NOTICE "ClusterIP Version %s unloading\n", + CLUSTERIP_VERSION); +#ifdef CONFIG_PROC_FS + remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); +#endif +cleanup_hook: + nf_unregister_hook(&cip_arp_ops); +cleanup_target: + ipt_unregister_target(&clusterip_tgt); +cleanup_none: + return -EINVAL; +} + +static int __init init(void) +{ + return init_or_cleanup(0); +} + +static void __exit fini(void) +{ + init_or_cleanup(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c new file mode 100644 index 000000000000..30ddd3e18eb7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -0,0 +1,118 @@ +/* This kernel module is used to modify the connection mark values, or + * to optionally restore the skb nfmark from the connection mark + * + * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> + * by Henrik Nordstrom <hno@marasystems.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>"); +MODULE_DESCRIPTION("IP tables CONNMARK matching module"); +MODULE_LICENSE("GPL"); + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_CONNMARK.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_connmark_target_info *markinfo = targinfo; + unsigned long diff; + unsigned long nfmark; + unsigned long newmark; + + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + if (ct) { + switch(markinfo->mode) { + case IPT_CONNMARK_SET: + newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; + if (newmark != ct->mark) + ct->mark = newmark; + break; + case IPT_CONNMARK_SAVE: + newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (ct->mark != newmark) + ct->mark = newmark; + break; + case IPT_CONNMARK_RESTORE: + nfmark = (*pskb)->nfmark; + diff = (ct->mark ^ nfmark) & markinfo->mask; + if (diff != 0) { + (*pskb)->nfmark = nfmark ^ diff; + (*pskb)->nfcache |= NFC_ALTERED; + } + break; + } + } + + return IPT_CONTINUE; +} 
+ +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_connmark_target_info *matchinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) { + printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_connmark_target_info))); + return 0; + } + + if (matchinfo->mode == IPT_CONNMARK_RESTORE) { + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_connmark_reg = { + .name = "CONNMARK", + .target = &target, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_connmark_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_connmark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c new file mode 100644 index 000000000000..3ea4509099f9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -0,0 +1,106 @@ +/* iptables module for setting the IPv4 DSCP field, Version 1.8 + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * See RFC2474 for a description of the DSCP field within the IP Header. 
+ * + * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp +*/ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_DSCP.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables DSCP modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_DSCP_info *dinfo = targinfo; + u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK); + + + if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return NF_DROP; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK) + | sh_dscp; + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^ 0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) { + printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_DSCP_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if ((dscp > IPT_DSCP_MAX)) { + printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_dscp_reg = { + .name = "DSCP", + 
.target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_dscp_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_dscp_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c new file mode 100644 index 000000000000..ada9911118e9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -0,0 +1,175 @@ +/* iptables module for the IPv4 and TCP ECN bits, Version 1.5 + * + * (C) 2002 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp +*/ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/tcp.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_ECN.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables ECN modification module"); + +/* set ECT codepoint from IP header. + * return 0 if there was an error. */ +static inline int +set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) +{ + if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK) + != (einfo->ip_ect & IPT_ECN_IP_MASK)) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return 0; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK; + (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return 1; +} + +/* Return 0 if there was an error. 
*/ +static inline int +set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) +{ + struct tcphdr _tcph, *tcph; + u_int16_t diffs[2]; + + /* Not enought header? */ + tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (!tcph) + return 0; + + if (!(einfo->operation & IPT_ECN_OP_SET_ECE + || tcph->ece == einfo->proto.tcp.ece) + && (!(einfo->operation & IPT_ECN_OP_SET_CWR + || tcph->cwr == einfo->proto.tcp.cwr))) + return 1; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + + diffs[0] = ((u_int16_t *)tcph)[6]; + if (einfo->operation & IPT_ECN_OP_SET_ECE) + tcph->ece = einfo->proto.tcp.ece; + if (einfo->operation & IPT_ECN_OP_SET_CWR) + tcph->cwr = einfo->proto.tcp.cwr; + diffs[1] = ((u_int16_t *)tcph)[6]; + diffs[0] = diffs[0] ^ 0xFFFF; + + if ((*pskb)->ip_summed != CHECKSUM_HW) + tcph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + tcph->check^0xFFFF)); + else + if (skb_checksum_help(*pskb, inward)) + return 0; + (*pskb)->nfcache |= NFC_ALTERED; + return 1; +} + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_ECN_info *einfo = targinfo; + + if (einfo->operation & IPT_ECN_OP_SET_IP) + if (!set_ect_ip(pskb, einfo)) + return NF_DROP; + + if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) + && (*pskb)->nh.iph->protocol == IPPROTO_TCP) + if (!set_ect_tcp(pskb, einfo, (out == NULL))) + return NF_DROP; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) { + 
printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_ECN_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (einfo->operation & IPT_ECN_OP_MASK) { + printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", + einfo->operation); + return 0; + } + if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { + printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", + einfo->ip_ect); + return 0; + } + + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) + && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) { + printk(KERN_WARNING "ECN: cannot use TCP operations on a " + "non-tcp rule\n"); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_ecn_reg = { + .name = "ECN", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_ecn_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_ecn_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c new file mode 100644 index 000000000000..ef08733d26da --- /dev/null +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -0,0 +1,485 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/icmp.h> +#include <net/udp.h> +#include <net/tcp.h> +#include <net/route.h> + +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_LOG.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables syslog logging module"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Use lock to serialize, so printks don't overlap */ +static DEFINE_SPINLOCK(log_lock); + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ipt_log_info *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + printk("TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. 
*/ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + printk("CE "); + if (ntohs(ih->frag_off) & IP_DF) + printk("DF "); + if (ntohs(ih->frag_off) & IP_MF) + printk("MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((info->logflags & IPT_LOG_IPOPT) + && ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + /* Max length: 10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (info->logflags & IPT_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(th->window)); + /* Max 
length: 9 "RES=0x3F " */ + printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + printk("CWR "); + if (th->ece) + printk("ECE "); + if (th->urg) + printk("URG "); + if (th->ack) + printk("ACK "); + if (th->psh) + printk("PSH "); + if (th->rst) + printk("RST "); + if (th->syn) + printk("SYN "); + if (th->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(th->urg_ptr)); + + if ((info->logflags & IPT_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; + unsigned char *op; + unsigned int i, optsize; + + optsize = th->doff * 4 - sizeof(struct tcphdr); + op = skb_header_pointer(skb, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMP: { + struct icmphdr _icmph, *ich; + static size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), 
+ [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + printk("PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + printk("PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + printk("GATEWAY=%u.%u.%u.%u ", + NIPQUAD(ich->un.gateway)); + /* Fall through */ + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. 
*/ + printk("["); + dump_packet(info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) + printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr, *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + printk("PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 10 "PROTO=ESP " */ + printk("PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + printk("PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one 
level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void +ipt_log_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_log_info *loginfo, + const char *level_string, + const char *prefix) +{ + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + prefix == NULL ? loginfo->prefix : prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + struct net_device *physindev = skb->nf_bridge->physindev; + struct net_device *physoutdev = skb->nf_bridge->physoutdev; + + if (physindev && in != physindev) + printk("PHYSIN=%s ", physindev->name); + if (physoutdev && out != physoutdev) + printk("PHYSOUT=%s ", physoutdev->name); + } +#endif + + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if (skb->dev && skb->dev->hard_header_len + && skb->mac.raw != (void*)skb->nh.iph) { + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? 
' ':':'); + } else + printk(" "); + } + + dump_packet(loginfo, skb, 0); + printk("\n"); + spin_unlock_bh(&log_lock); +} + +static unsigned int +ipt_log_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + + return IPT_CONTINUE; +} + +static void +ipt_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ipt_log_info loginfo = { + .level = 0, + .logflags = IPT_LOG_MASK, + .prefix = "" + }; + + ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); +} + +static int ipt_log_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_log_info *loginfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_log_reg = { + .name = "LOG", + .target = ipt_log_target, + .checkentry = ipt_log_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; + if (nflog) + nf_log_register(PF_INET, &ipt_logfn); + + return 0; +} + +static void __exit fini(void) +{ + if (nflog) + nf_log_unregister(PF_INET, &ipt_logfn); + ipt_unregister_target(&ipt_log_reg); +} + 
+module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c new file mode 100644 index 000000000000..33c6f9b63b8d --- /dev/null +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -0,0 +1,162 @@ +/* This is a module which is used for setting the NFMARK field of an skb. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <net/checksum.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_MARK.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); +MODULE_DESCRIPTION("iptables MARK modification module"); + +static unsigned int +target_v0(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static unsigned int +target_v1(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info_v1 *markinfo = targinfo; + int mark = 0; + + switch (markinfo->mode) { + case IPT_MARK_SET: + mark = markinfo->mark; + break; + + case IPT_MARK_AND: + mark = (*pskb)->nfmark & markinfo->mark; + break; + + case IPT_MARK_OR: + mark = (*pskb)->nfmark | markinfo->mark; + break; + } + + if((*pskb)->nfmark != mark) { + (*pskb)->nfmark = mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + + +static int +checkentry_v0(const char *tablename, + const struct ipt_entry *e, + 
void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static int +checkentry_v1(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_mark_target_info_v1 *markinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){ + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (markinfo->mode != IPT_MARK_SET + && markinfo->mode != IPT_MARK_AND + && markinfo->mode != IPT_MARK_OR) { + printk(KERN_WARNING "MARK: unknown mode %u\n", + markinfo->mode); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_mark_reg_v0 = { + .name = "MARK", + .target = target_v0, + .checkentry = checkentry_v0, + .me = THIS_MODULE, + .revision = 0, +}; + +static struct ipt_target ipt_mark_reg_v1 = { + .name = "MARK", + .target = target_v1, + .checkentry = checkentry_v1, + .me = THIS_MODULE, + .revision = 1, +}; + +static int __init init(void) +{ + int err; + + err = ipt_register_target(&ipt_mark_reg_v0); + if (!err) { + err = ipt_register_target(&ipt_mark_reg_v1); + if (err) + ipt_unregister_target(&ipt_mark_reg_v0); + } + return err; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_mark_reg_v0); + ipt_unregister_target(&ipt_mark_reg_v1); +} + +module_init(init); +module_exit(fini); diff --git 
a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c new file mode 100644 index 000000000000..57e9f6cf1c36 --- /dev/null +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -0,0 +1,207 @@ +/* Masquerade. Simple mapping which alters range to a local IP address + (depending on route). */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/config.h> +#include <linux/types.h> +#include <linux/ip.h> +#include <linux/timer.h> +#include <linux/module.h> +#include <linux/netfilter.h> +#include <net/protocol.h> +#include <net/ip.h> +#include <net/checksum.h> +#include <linux/netfilter_ipv4.h> +#include <linux/netfilter_ipv4/ip_nat_rule.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables MASQUERADE target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Lock protects masq region inside conntrack */ +static DECLARE_RWLOCK(masq_lock); + +/* FIXME: Multiple targets. 
--RR */ +static int +masquerade_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("masquerade_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("masquerade_check: size %u != %u.\n", + targinfosize, sizeof(*mr)); + return 0; + } + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("masquerade_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +masquerade_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr; + struct ip_nat_range newrange; + struct rtable *rt; + u_int32_t newsrc; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + /* FIXME: For the moment, don't do local packets, breaks + testsuite for 2.3.49 --RR */ + if ((*pskb)->sk) + return NF_ACCEPT; + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + + mr = targinfo; + rt = (struct rtable *)(*pskb)->dst; + newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + if (!newsrc) { + printk("MASQUERADE: %s ate my IP address\n", out->name); + return NF_DROP; + } + + WRITE_LOCK(&masq_lock); + ct->nat.masq_index = out->ifindex; + WRITE_UNLOCK(&masq_lock); + + /* Transfer from original range. 
*/ + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newsrc, newsrc, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static inline int +device_cmp(struct ip_conntrack *i, void *ifindex) +{ + int ret; + + READ_LOCK(&masq_lock); + ret = (i->nat.masq_index == (int)(long)ifindex); + READ_UNLOCK(&masq_lock); + + return ret; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. */ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (event == NETDEV_DOWN) { + /* IP address was deleted. Search entire table for + conntracks which were associated with that device, + and forget them. 
*/ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct ipt_target masquerade = { + .name = "MASQUERADE", + .target = masquerade_target, + .checkentry = masquerade_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + ret = ipt_register_target(&masquerade); + + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&masquerade); + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c new file mode 100644 index 000000000000..06254b29d034 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -0,0 +1,117 @@ +/* NETMAP - static NAT mapping of IP network addresses (1:1). + * The mapping can be applied to source (POSTROUTING), + * destination (PREROUTING), or both (with separate rules). + */ + +/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/config.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+#define MODULENAME "NETMAP"
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
+MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Validate a NETMAP rule: "nat" table only, PREROUTING/POSTROUTING only,
+ * exactly one range, and that range must carry IP_NAT_RANGE_MAP_IPS.
+ * Returns 1 when the rule is acceptable, 0 otherwise. */
+static int
+check(const char *tablename,
+      const struct ipt_entry *e,
+      void *targinfo,
+      unsigned int targinfosize,
+      unsigned int hook_mask)
+{
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
+		return 0;
+	}
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+		DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
+		DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
+		return 0;
+	}
+	if (mr->rangesize != 1) {
+		DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
+		return 0;
+	}
+	return 1;
+}
+
+/* Build the 1:1 mapped address: the bits on which min_ip and max_ip agree
+ * are taken from min_ip, the remaining bits from the packet's own address
+ * (daddr in PREROUTING, saddr otherwise), then hand the single-address
+ * range to ip_nat_setup_info(). */
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t new_ip, netmask;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+	struct ip_nat_range newrange;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_POST_ROUTING);
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	/* Bits set in netmask are identical in min_ip and max_ip. */
+	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+	if (hooknum == NF_IP_PRE_ROUTING)
+		new_ip = (*pskb)->nh.iph->daddr & ~netmask;
+	else
+		new_ip = (*pskb)->nh.iph->saddr & ~netmask;
+	new_ip |= mr->range[0].min_ip & netmask;
+
+	newrange = ((struct ip_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  new_ip, new_ip,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target target_module = {
+	.name		= MODULENAME,
+	.target		= target,
+	.checkentry	= check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&target_module);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&target_module);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
new file mode 100644
index 000000000000..a4bb9b3bc292
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -0,0 +1,76 @@
+/* This is a module which is used for setting up fake conntracks
+ * on packets so that they are not seen by the conntrack/NAT code.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+/* Attach the shared "untracked" conntrack entry to the packet unless it
+ * already carries one.  Always returns IPT_CONTINUE. */
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	/* Previously seen (loopback)? Ignore. */
+	if ((*pskb)->nfct != NULL)
+		return IPT_CONTINUE;
+
+	/* Attach fake conntrack entry.
+	   If there is a real ct entry corresponding to this packet,
+	   it'll hang around till timing out. We don't deal with it
+	   for performance reasons. JK */
+	(*pskb)->nfct = &ip_conntrack_untracked.ct_general;
+	(*pskb)->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get((*pskb)->nfct);
+
+	return IPT_CONTINUE;
+}
+
+/* Validate a NOTRACK rule: no target data, "raw" table only.
+ * Returns 1 when the rule is acceptable, 0 otherwise. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+	   void *targinfo,
+	   unsigned int targinfosize,
+	   unsigned int hook_mask)
+{
+	if (targinfosize != 0) {
+		printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
+		       targinfosize);
+		return 0;
+	}
+
+	if (strcmp(tablename, "raw") != 0) {
+		printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_notrack_reg = {
+	.name = "NOTRACK",
+	.target = target,
+	.checkentry = checkentry,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	/* Hand back the registration result unchanged so the caller sees
+	 * the real error code rather than a blanket -EINVAL. */
+	return ipt_register_target(&ipt_notrack_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_notrack_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 000000000000..d2e13447678e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,129 @@
+/* Redirect.  Simple mapping which alters dst to a local IP address. */
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REDIRECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Validate a REDIRECT rule: "nat" table only, PREROUTING/OUTPUT only,
+ * exactly one range, and that range must NOT carry IP_NAT_RANGE_MAP_IPS
+ * (the target picks the address itself).
+ * Returns 1 when the rule is acceptable, 0 otherwise. */
+/* FIXME: Take multiple ranges --RR */
+static int
+redirect_check(const char *tablename,
+	       const struct ipt_entry *e,
+	       void *targinfo,
+	       unsigned int targinfosize,
+	       unsigned int hook_mask)
+{
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		/* The parameter is called tablename; the former `table'
+		 * only compiled while DEBUGP expanded to nothing. */
+		DEBUGP("redirect_check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP("redirect_check: size %u.\n", targinfosize);
+		return 0;
+	}
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+		DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+		DEBUGP("redirect_check: bad MAP_IPS.\n");
+		return 0;
+	}
+	if (mr->rangesize != 1) {
+		DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
+		return 0;
+	}
+	return 1;
+}
+
+/* Pick the new destination (127.0.0.1 for locally generated packets,
+ * otherwise the first address of the incoming device) and hand the
+ * resulting single-address range to ip_nat_setup_info().  Returns NF_DROP
+ * when the incoming device has no IPv4 address to redirect to. */
+static unsigned int
+redirect_target(struct sk_buff **pskb,
+		const struct net_device *in,
+		const struct net_device *out,
+		unsigned int hooknum,
+		const void *targinfo,
+		void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t newdst;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+	struct ip_nat_range newrange;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+	/* Local packets: make them go to loopback */
+	if (hooknum == NF_IP_LOCAL_OUT)
+		newdst = htonl(0x7F000001);
+	else {
+		struct in_device *indev;
+
+		/* Device might not have an associated in_device. */
+		indev = (struct in_device *)(*pskb)->dev->ip_ptr;
+		if (indev == NULL || indev->ifa_list == NULL)
+			return NF_DROP;
+
+		/* Grab first address on interface. */
+		newdst = indev->ifa_list->ifa_local;
+	}
+
+	/* Transfer from original range. */
+	newrange = ((struct ip_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  newdst, newdst,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target redirect_reg = {
+	.name		= "REDIRECT",
+	.target		= redirect_target,
+	.checkentry	= redirect_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&redirect_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&redirect_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000000..266d64979286
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,335 @@
+/*
+ * This is a module which is used for rejecting packets.
+ * Added support for customized reject packets (Jozsef Kadlecsik).
+ * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include <linux/netfilter_bridge.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REJECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Find a route back to the sender of skb for the RST reply.
+ * Returns the route (caller must dst_release() it or attach it to an
+ * skb), or NULL when no usable route exists. */
+static inline struct rtable *route_reverse(struct sk_buff *skb,
+					   struct tcphdr *tcph, int hook)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct dst_entry *odst;
+	struct flowi fl = {};
+	struct rtable *rt;
+
+	/* We don't require ip forwarding to be enabled to be able to
+	 * send a RST reply for bridged traffic. */
+	if (hook != NF_IP_FORWARD
+#ifdef CONFIG_BRIDGE_NETFILTER
+	    || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED)
+#endif
+	   ) {
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		if (hook == NF_IP_LOCAL_IN)
+			fl.nl_u.ip4_u.saddr = iph->daddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return NULL;
+	} else {
+		/* non-local src, find valid iif to satisfy
+		 * rp-filter when calling ip_route_input. */
+		fl.nl_u.ip4_u.daddr = iph->daddr;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return NULL;
+
+		/* ip_route_input() overwrites skb->dst: save the original
+		 * and put it back once the reverse route has been taken. */
+		odst = skb->dst;
+		if (ip_route_input(skb, iph->saddr, iph->daddr,
+		                   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+			dst_release(&rt->u.dst);
+			return NULL;
+		}
+		dst_release(&rt->u.dst);
+		rt = (struct rtable *)skb->dst;
+		skb->dst = odst;
+
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		fl.nl_u.ip4_u.saddr = iph->daddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+	}
+
+	if (rt->u.dst.error) {
+		dst_release(&rt->u.dst);
+		return NULL;
+	}
+
+	fl.proto = IPPROTO_TCP;
+	fl.fl_ip_sport = tcph->dest;
+	fl.fl_ip_dport = tcph->source;
+
+	if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
+		dst_release(&rt->u.dst);
+		rt = NULL;
+	}
+
+	return rt;
+}
+
+/* Send RST reply: copy oldskb, swap addresses and ports, strip options
+ * and payload, set RST (plus ACK when the original had none), fix both
+ * checksums and push the result through the LOCAL_OUT hook. */
+static void send_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	struct tcphdr _otcph, *oth, *tcph;
+	struct rtable *rt;
+	u_int16_t tmp_port;
+	u_int32_t tmp_addr;
+	int needs_ack;
+	int hh_len;
+
+	/* IP header checks: fragment. */
+	if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
+		return;
+
+	oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
+				 sizeof(_otcph), &_otcph);
+	if (oth == NULL)
+		return;
+
+	/* No RST for RST. */
+	if (oth->rst)
+		return;
+
+	/* FIXME: Check checksum --RR */
+	if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
+		return;
+
+	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+	/* We need a linear, writeable skb.  We also need to expand
+	   headroom in case hh_len of incoming interface < hh_len of
+	   outgoing interface */
+	nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb),
+			       GFP_ATOMIC);
+	if (!nskb) {
+		dst_release(&rt->u.dst);
+		return;
+	}
+
+	dst_release(nskb->dst);
+	nskb->dst = &rt->u.dst;
+
+	/* This packet will not be the same as the other: clear nf fields */
+	nf_reset(nskb);
+	nskb->nfcache = 0;
+	nskb->nfmark = 0;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	nf_bridge_put(nskb->nf_bridge);
+	nskb->nf_bridge = NULL;
+#endif
+
+	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
+
+	/* Swap source and dest */
+	tmp_addr = nskb->nh.iph->saddr;
+	nskb->nh.iph->saddr = nskb->nh.iph->daddr;
+	nskb->nh.iph->daddr = tmp_addr;
+	tmp_port = tcph->source;
+	tcph->source = tcph->dest;
+	tcph->dest = tmp_port;
+
+	/* Truncate to length (no data) */
+	tcph->doff = sizeof(struct tcphdr)/4;
+	skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
+	nskb->nh.iph->tot_len = htons(nskb->len);
+
+	if (tcph->ack) {
+		needs_ack = 0;
+		tcph->seq = oth->ack_seq;
+		tcph->ack_seq = 0;
+	} else {
+		needs_ack = 1;
+		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
+				      + oldskb->len - oldskb->nh.iph->ihl*4
+				      - (oth->doff<<2));
+		tcph->seq = 0;
+	}
+
+	/* Reset flags */
+	((u_int8_t *)tcph)[13] = 0;
+	tcph->rst = 1;
+	tcph->ack = needs_ack;
+
+	tcph->window = 0;
+	tcph->urg_ptr = 0;
+
+	/* Adjust TCP checksum */
+	tcph->check = 0;
+	tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
+				   nskb->nh.iph->saddr,
+				   nskb->nh.iph->daddr,
+				   csum_partial((char *)tcph,
+						sizeof(struct tcphdr), 0));
+
+	/* Adjust IP TTL, DF */
+	nskb->nh.iph->ttl = MAXTTL;
+	/* Set DF, id = 0 */
+	nskb->nh.iph->frag_off = htons(IP_DF);
+	nskb->nh.iph->id = 0;
+
+	/* Adjust IP checksum */
+	nskb->nh.iph->check = 0;
+	nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph,
+					   nskb->nh.iph->ihl);
+
+	/* "Never happens" */
+	if (nskb->len > dst_mtu(nskb->dst))
+		goto free_nskb;
+
+	nf_ct_attach(nskb, oldskb);
+
+	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+		dst_output);
+	return;
+
+ free_nskb:
+	kfree_skb(nskb);
+}
+
+/* Send an ICMP destination-unreachable with the given code for skb_in. */
+static inline void send_unreach(struct sk_buff *skb_in, int code)
+{
+	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}
+
+/* Target entry point: emit the configured ICMP error or TCP RST and drop
+ * the packet.  Packets carrying IP options are dropped silently. */
+static unsigned int reject(struct sk_buff **pskb,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   unsigned int hooknum,
+			   const void *targinfo,
+			   void *userinfo)
+{
+	const struct ipt_reject_info *reject = targinfo;
+
+	/* Our naive response construction doesn't deal with IP
+	   options, and probably shouldn't try. */
+	if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
+		return NF_DROP;
+
+	/* WARNING: This code causes reentry within iptables.
+	   This means that the iptables jump stack is now crap.  We
+	   must return an absolute verdict. --RR */
+	switch (reject->with) {
+	case IPT_ICMP_NET_UNREACHABLE:
+		send_unreach(*pskb, ICMP_NET_UNREACH);
+		break;
+	case IPT_ICMP_HOST_UNREACHABLE:
+		send_unreach(*pskb, ICMP_HOST_UNREACH);
+		break;
+	case IPT_ICMP_PROT_UNREACHABLE:
+		send_unreach(*pskb, ICMP_PROT_UNREACH);
+		break;
+	case IPT_ICMP_PORT_UNREACHABLE:
+		send_unreach(*pskb, ICMP_PORT_UNREACH);
+		break;
+	case IPT_ICMP_NET_PROHIBITED:
+		send_unreach(*pskb, ICMP_NET_ANO);
+		break;
+	case IPT_ICMP_HOST_PROHIBITED:
+		send_unreach(*pskb, ICMP_HOST_ANO);
+		break;
+	case IPT_ICMP_ADMIN_PROHIBITED:
+		send_unreach(*pskb, ICMP_PKT_FILTERED);
+		break;
+	case IPT_TCP_RESET:
+		send_reset(*pskb, hooknum);
+		break;
+	case IPT_ICMP_ECHOREPLY:
+		/* Doesn't happen. */
+		break;
+	}
+
+	return NF_DROP;
+}
+
+/* Validate a REJECT rule: "filter" table only, INPUT/FORWARD/OUTPUT only,
+ * no ECHOREPLY, and TCP reset only on rules that match TCP.
+ * Returns 1 when the rule is acceptable, 0 otherwise. */
+static int check(const char *tablename,
+		 const struct ipt_entry *e,
+		 void *targinfo,
+		 unsigned int targinfosize,
+		 unsigned int hook_mask)
+{
+	const struct ipt_reject_info *rejinfo = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
+		DEBUGP("REJECT: targinfosize %u != %Zu\n", targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_reject_info)));
+		return 0;
+	}
+
+	/* Only allow these for packet filtering. */
+	if (strcmp(tablename, "filter") != 0) {
+		DEBUGP("REJECT: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
+			   | (1 << NF_IP_FORWARD)
+			   | (1 << NF_IP_LOCAL_OUT))) != 0) {
+		DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
+		return 0;
+	}
+
+	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+		printk("REJECT: ECHOREPLY no longer supported.\n");
+		return 0;
+	} else if (rejinfo->with == IPT_TCP_RESET) {
+		/* Must specify that it's a TCP packet */
+		if (e->ip.proto != IPPROTO_TCP
+		    || (e->ip.invflags & IPT_INV_PROTO)) {
+			DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_reject_reg = {
+	.name		= "REJECT",
+	.target		= reject,
+	.checkentry	= check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_reject_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
new file mode 100644
index 000000000000..7a0536d864ac
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -0,0 +1,211 @@
+/* Same.  Just like SNAT, only try to make the connections
+ * 	  between client A and server B always have the same source ip.
+ *
+ * (C) 2000 Paul `Rusty' Russell
+ * (C) 2001 Martin Josefsson
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
+ * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* added --nodst to not include destination-ip in new source
+ * 	  calculations.
+ * 	* added some more sanity-checks.
+ * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* fixed a buggy if-statement in same_check(), should have
+ * 	  used ntohl() but didn't.
+ * 	* added support for multiple ranges. IPT_SAME_MAX_RANGE is
+ * 	  defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
+ * 	  and is currently set to 10.
+ * 	* added support for 1-address range, nice to have now that
+ * 	  we have multiple ranges.
+ */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ipt_SAME.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
+MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Validate a SAME rule and expand its ranges into mr->iparray, a flat
+ * array of host-order addresses released again by same_destroy().
+ * Returns 1 when the rule is acceptable, 0 otherwise (nothing is left
+ * allocated in that case). */
+static int
+same_check(const char *tablename,
+	      const struct ipt_entry *e,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	unsigned int count, countess, rangeip, index = 0;
+	struct ipt_same_info *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("same_check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP("same_check: size %u.\n", targinfosize);
+		return 0;
+	}
+
+	/* Only write into targinfo once its size is known to be right. */
+	mr->ipnum = 0;
+
+	if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) {
+		DEBUGP("same_check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (mr->rangesize < 1) {
+		DEBUGP("same_check: need at least one dest range.\n");
+		return 0;
+	}
+	if (mr->rangesize > IPT_SAME_MAX_RANGE) {
+		DEBUGP("same_check: too many ranges specified, maximum "
+				"is %u ranges\n",
+				IPT_SAME_MAX_RANGE);
+		return 0;
+	}
+	for (count = 0; count < mr->rangesize; count++) {
+		if (ntohl(mr->range[count].min_ip) >
+				ntohl(mr->range[count].max_ip)) {
+			DEBUGP("same_check: min_ip is larger than max_ip in "
+				"range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
+				NIPQUAD(mr->range[count].min_ip),
+				NIPQUAD(mr->range[count].max_ip));
+			return 0;
+		}
+		if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
+			DEBUGP("same_check: bad MAP_IPS.\n");
+			return 0;
+		}
+		rangeip = (ntohl(mr->range[count].max_ip) -
+					ntohl(mr->range[count].min_ip) + 1);
+		/* rangeip wraps to 0 for 0.0.0.0-255.255.255.255 and the
+		 * running total can wrap too; either would make the array
+		 * below smaller than the number of entries written. */
+		if (rangeip == 0 || mr->ipnum + rangeip < mr->ipnum) {
+			DEBUGP("same_check: too many ipaddresses.\n");
+			return 0;
+		}
+		mr->ipnum += rangeip;
+
+		DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
+	}
+	DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
+
+	/* Keep the byte count below from wrapping where size_t is 32 bit. */
+	if (mr->ipnum > ~0U / sizeof(u_int32_t)) {
+		DEBUGP("same_check: too many ipaddresses.\n");
+		return 0;
+	}
+
+	mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
+	if (!mr->iparray) {
+		DEBUGP("same_check: Couldn't allocate %Zu bytes "
+			"for %u ipaddresses!\n",
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+		return 0;
+	}
+	DEBUGP("same_check: Allocated %Zu bytes for %u ipaddresses.\n",
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+
+	for (count = 0; count < mr->rangesize; count++) {
+		u_int32_t first = ntohl(mr->range[count].min_ip);
+
+		/* Walk by offset rather than by address: comparing an
+		 * address against max_ip never ends when max_ip is
+		 * 255.255.255.255. */
+		rangeip = ntohl(mr->range[count].max_ip) - first + 1;
+		for (countess = 0; countess < rangeip; countess++) {
+			mr->iparray[index] = first + countess;
+			DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
+				"in index %u.\n",
+				HIPQUAD(first + countess), index);
+			index++;
+		}
+	}
+	return 1;
+}
+
+/* Release the address array built by same_check(). */
+static void
+same_destroy(void *targinfo,
+		unsigned int targinfosize)
+{
+	struct ipt_same_info *mr = targinfo;
+
+	kfree(mr->iparray);
+
+	DEBUGP("same_destroy: Deallocated %Zu bytes for %u ipaddresses.\n",
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+}
+
+/* Choose the new address by indexing same->iparray with the original
+ * source address (plus the destination address unless IPT_SAME_NODST is
+ * set) modulo the number of addresses, then hand the single-address
+ * range to ip_nat_setup_info(). */
+static unsigned int
+same_target(struct sk_buff **pskb,
+		const struct net_device *in,
+		const struct net_device *out,
+		unsigned int hooknum,
+		const void *targinfo,
+		void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t tmpip, aindex, new_ip;
+	const struct ipt_same_info *same = targinfo;
+	struct ip_nat_range newrange;
+	const struct ip_conntrack_tuple *t;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
+			hooknum == NF_IP_POST_ROUTING);
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+	/* Base new source on real src ip and optionally dst ip,
+	   giving some hope for consistency across reboots.
+	   Here we calculate the index in same->iparray which
+	   holds the ipaddress we should use */
+
+	tmpip = ntohl(t->src.ip);
+
+	if (!(same->info & IPT_SAME_NODST))
+		tmpip += ntohl(t->dst.ip);
+
+	aindex = tmpip % same->ipnum;
+
+	new_ip = htonl(same->iparray[aindex]);
+
+	DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
+			"new src=%u.%u.%u.%u\n",
+			NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
+			NIPQUAD(new_ip));
+
+	/* Transfer from original range. */
+	newrange = ((struct ip_nat_range)
+		{ same->range[0].flags, new_ip, new_ip,
+		  /* FIXME: Use ports from correct range! */
+		  same->range[0].min, same->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target same_reg = {
+	.name		= "SAME",
+	.target		= same_target,
+	.checkentry	= same_check,
+	.destroy	= same_destroy,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&same_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&same_reg);
+}
+
+module_init(init);
+module_exit(fini);
+
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
new file mode 100644
index 000000000000..1049050b2bfb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -0,0 +1,262 @@
+/*
+ * This is a module which is used for setting the MSS option in TCP packets.
+ *
+ * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS modification module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Incrementally update a one's-complement checksum after a 32-bit field
+ * changed: oldvalinv is the bitwise inverse of the old value, newval the
+ * new value, oldcheck the checksum so far. */
+static u_int16_t
+cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+	u_int32_t diffs[] = { oldvalinv, newval };
+	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+                                      oldcheck^0xFFFF));
+}
+
+/* Length of the TCP option starting at opt[offset]; never less than 1. */
+static inline unsigned int
+optlen(const u_int8_t *opt, unsigned int offset)
+{
+	/* Beware zero-length options: make finite progress */
+	if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1;
+	else return opt[offset+1];
+}
+
+/* Rewrite the MSS option of a TCP SYN, or insert one when it is missing,
+ * to the configured value or to path-MTU minus 40 when clamping.
+ * Checksums are patched incrementally.  Returns IPT_CONTINUE, or NF_DROP
+ * when the packet cannot be handled. */
+static unsigned int
+ipt_tcpmss_target(struct sk_buff **pskb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  unsigned int hooknum,
+		  const void *targinfo,
+		  void *userinfo)
+{
+	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+	struct tcphdr *tcph;
+	struct iphdr *iph;
+	u_int16_t tcplen, newtotlen, oldval, newmss;
+	unsigned int i;
+	u_int8_t *opt;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	iph = (*pskb)->nh.iph;
+	tcplen = (*pskb)->len - iph->ihl*4;
+
+	tcph = (void *)iph + iph->ihl*4;
+
+	/* Since it passed flags test in tcp match, we know it is
+	   not a fragment, and has data >= tcp header length.  SYN
+	   packets should not contain data: if they did, then we risk
+	   running over MTU, sending Frag Needed and breaking things
+	   badly. --RR */
+	if (tcplen != tcph->doff*4) {
+		if (net_ratelimit())
+			printk(KERN_ERR
+			       "ipt_tcpmss_target: bad length (%u bytes)\n",
+			       (*pskb)->len);
+		return NF_DROP;
+	}
+
+	if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
+		if(!(*pskb)->dst) {
+			if (net_ratelimit())
+				printk(KERN_ERR
+			       		"ipt_tcpmss_target: no dst?! can't determine path-MTU\n");
+			return NF_DROP; /* or IPT_CONTINUE ?? */
+		}
+
+		if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
+			if (net_ratelimit())
+				printk(KERN_ERR
+		       			"ipt_tcpmss_target: unknown or invalid path-MTU (%u)\n", dst_mtu((*pskb)->dst));
+			return NF_DROP; /* or IPT_CONTINUE ?? */
+		}
+
+		newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
+	} else
+		newmss = tcpmssinfo->mss;
+
+ 	opt = (u_int8_t *)tcph;
+	for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){
+		if ((opt[i] == TCPOPT_MSS) &&
+		    ((tcph->doff*4 - i) >= TCPOLEN_MSS) &&
+		    (opt[i+1] == TCPOLEN_MSS)) {
+			u_int16_t oldmss;
+
+			oldmss = (opt[i+2] << 8) | opt[i+3];
+
+			/* When clamping, never raise an MSS that is already
+			 * small enough. */
+			if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
+				(oldmss <= newmss))
+					return IPT_CONTINUE;
+
+			opt[i+2] = (newmss & 0xff00) >> 8;
+			opt[i+3] = (newmss & 0x00ff);
+
+			tcph->check = cheat_check(htons(oldmss)^0xFFFF,
+						  htons(newmss),
+						  tcph->check);
+
+			DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+			       "->%u.%u.%u.%u:%hu changed TCP MSS option"
+			       " (from %u to %u)\n",
+			       NIPQUAD((*pskb)->nh.iph->saddr),
+			       ntohs(tcph->source),
+			       NIPQUAD((*pskb)->nh.iph->daddr),
+			       ntohs(tcph->dest),
+			       oldmss, newmss);
+			goto retmodified;
+		}
+	}
+
+	/*
+	 * MSS Option not found ?! add it..
+	 */
+	if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
+		struct sk_buff *newskb;
+
+		newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
+					 TCPOLEN_MSS, GFP_ATOMIC);
+		if (!newskb) {
+			if (net_ratelimit())
+				printk(KERN_ERR "ipt_tcpmss_target:"
+				       " unable to allocate larger skb\n");
+			return NF_DROP;
+		}
+
+		kfree_skb(*pskb);
+		*pskb = newskb;
+		/* The headers moved with the copy: refresh the pointers. */
+		iph = (*pskb)->nh.iph;
+		tcph = (void *)iph + iph->ihl*4;
+	}
+
+	skb_put((*pskb), TCPOLEN_MSS);
+
+ 	opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
+	memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
+
+	tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
+				  htons(tcplen + TCPOLEN_MSS), tcph->check);
+	tcplen += TCPOLEN_MSS;
+
+	opt[0] = TCPOPT_MSS;
+	opt[1] = TCPOLEN_MSS;
+	opt[2] = (newmss & 0xff00) >> 8;
+	opt[3] = (newmss & 0x00ff);
+
+	tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
+
+	oldval = ((u_int16_t *)tcph)[6];
+	tcph->doff += TCPOLEN_MSS/4;
+	tcph->check = cheat_check(oldval ^ 0xFFFF,
+				  ((u_int16_t *)tcph)[6], tcph->check);
+
+	newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
+	iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
+				 newtotlen, iph->check);
+	iph->tot_len = newtotlen;
+
+	DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+	       "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n",
+	       NIPQUAD((*pskb)->nh.iph->saddr),
+	       ntohs(tcph->source),
+	       NIPQUAD((*pskb)->nh.iph->daddr),
+	       ntohs(tcph->dest),
+	       newmss);
+
+ retmodified:
+	/* We never hw checksum SYN packets.  */
+	BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
+
+	(*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
+	return IPT_CONTINUE;
+}
+
+#define TH_SYN 0x02
+
+/* Match-iterator callback: returns 1 for a non-inverted "tcp" match
+ * whose compared flags include SYN, 0 otherwise. */
+static inline int find_syn_match(const struct ipt_entry_match *m)
+{
+	const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
+
+	if (strcmp(m->u.kernel.match->name, "tcp") == 0
+	    && (tcpinfo->flg_cmp & TH_SYN)
+	    && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
+		return 1;
+
+	return 0;
+}
+
+/* Must specify -p tcp --syn/--tcp-flags SYN */
+static int
+ipt_tcpmss_checkentry(const char *tablename,
+		      const struct ipt_entry *e,
+		      void *targinfo,
+		      unsigned int targinfosize,
+		      unsigned int hook_mask)
+{
+	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
+		DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %Zu\n",
+		       targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info)));
+		return 0;
+	}
+
+
+	if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
+			((hook_mask & ~((1 << NF_IP_FORWARD)
+			   	| (1 << NF_IP_LOCAL_OUT)
+			   	| (1 << NF_IP_POST_ROUTING))) != 0)) {
+		printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+		return 0;
+	}
+
+	if (e->ip.proto == IPPROTO_TCP
+	    && !(e->ip.invflags & IPT_INV_PROTO)
+	    && IPT_MATCH_ITERATE(e, find_syn_match))
+		return 1;
+
+	printk("TCPMSS: Only works on TCP SYN packets\n");
+	return 0;
+}
+
+static struct ipt_target ipt_tcpmss_reg = {
+	.name		= "TCPMSS",
+	.target		= ipt_tcpmss_target,
+	.checkentry	= ipt_tcpmss_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_tcpmss_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_tcpmss_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 000000000000..85c70d240f8b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,105 @@
+/* This is a module which is used for setting the TOS field of a packet. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables TOS mangling module");
+
+/* Replace the TOS bits of the IP header (precedence bits are kept) and
+ * patch the header checksum incrementally.  Returns IPT_CONTINUE, or
+ * NF_DROP when the header cannot be made writable. */
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_tos_target_info *tosinfo = targinfo;
+
+	if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
+		u_int16_t diffs[2];
+
+		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+		(*pskb)->nh.iph->tos
+			= ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK)
+			| tosinfo->tos;
+		diffs[1] = htons((*pskb)->nh.iph->tos);
+		(*pskb)->nh.iph->check
+			= csum_fold(csum_partial((char *)diffs,
+						 sizeof(diffs),
+						 (*pskb)->nh.iph->check
+						 ^0xFFFF));
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+
+/* Validate a TOS rule: "mangle" table only and one of the five standard
+ * TOS values.  Returns 1 when the rule is acceptable, 0 otherwise. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	u_int8_t tos;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
+		printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	/* Read targinfo only after its size has been verified above. */
+	tos = ((struct ipt_tos_target_info *)targinfo)->tos;
+
+	if (tos != IPTOS_LOWDELAY
+	    && tos != IPTOS_THROUGHPUT
+	    && tos != IPTOS_RELIABILITY
+	    && tos != IPTOS_MINCOST
+	    && tos != IPTOS_NORMALSVC) {
+		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_tos_reg = {
+	.name		= "TOS",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_tos_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_tos_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 000000000000..6f2cefbe16cd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,419 @@
+/*
+ * netfilter module for userspace packet logging daemons
+ *
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * 2000/09/22 ulog-cprange feature added
+ * 2001/01/04 in-kernel queue as proposed by Sebastian Zander
+ * 						<zander@fokus.gmd.de>
+ * 2001/01/30 per-rule nlgroup conflicts with global queue.
+ *            nlgroup now global (sysctl)
+ * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
+ * 	      module loadtime -HW
+ * 2002/07/07 remove broken nflog_rcv() function -HW
+ * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
+ * 2002/10/30 fix uninitialized mac_len field - <Anders K.
Pedersen>
+ * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
+ *	      resulting in bogus 'error during NLMSG_PUT' messages.
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This module accepts two parameters:
+ *
+ * nlbufsiz:
+ *   The parameter specifies how big the buffer for each netlink multicast
+ * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
+ * get accumulated in the kernel until they are sent to userspace. It is
+ * NOT possible to allocate more than 128kB, and it is strongly discouraged,
+ * because atomically allocating 128kB inside the network rx softirq is not
+ * reliable. Please also keep in mind that this buffer size is allocated for
+ * each nlgroup you are using, so the total kernel memory usage increases
+ * by that factor.
+ *
+ * flushtimeout:
+ *   Specify, after how many hundredths of a second the queue should be
+ *   flushed even if it is not full yet.
+ *
+ * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <net/sock.h>
+#include <linux/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("iptables userspace logging module");
+
+#define ULOG_NL_EVENT		111		/* Harald's favorite number */
+#define ULOG_MAXNLGROUPS	32		/* number of nlgroups */
+
+#if 0
+#define DEBUGP(format, args...)	printk("%s:%s:" format, \
+				       __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* rate-limited printk, used on the allocation / message-building error paths */
+#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
+
+static unsigned int nlbufsiz = 4096;
+module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */
+MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
+
+/* the variables below are unsigned, so they are registered as uint */
+static unsigned int flushtimeout = 10;
+module_param(flushtimeout, uint, 0600);
+MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
+
+static unsigned int nflog = 1;
+module_param(nflog, uint, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
+
+/* global data structures */
+
+typedef struct {
+	unsigned int qlen;		/* number of nlmsgs' in the skb */
+	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
+	struct sk_buff *skb;		/* the pre-allocated skb */
+	struct timer_list timer;	/* the timer function */
+} ulog_buff_t;
+
+static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */
+
+static struct sock *nflognl;	/* our socket */
+static DECLARE_LOCK(ulog_lock);	/* spinlock */
+
+/* send one ulog_buff_t to userspace
+ *
+ * Both callers in this file (ulog_timer and ipt_ulog_packet) invoke this
+ * with ulog_lock held.  On return the buffer slot is empty again
+ * (qlen == 0, skb == NULL, lastnlh == NULL). */
+static void ulog_send(unsigned int nlgroupnum)
+{
+	ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+
+	if (timer_pending(&ub->timer)) {
+		DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
+		del_timer(&ub->timer);
+	}
+
+	/* The flush timer may run after the packet path has already sent
+	 * this buffer (del_timer() above does not stop a handler that has
+	 * started and is waiting for ulog_lock).  In that case there is
+	 * nothing queued, and touching ub->skb would dereference NULL. */
+	if (!ub->skb) {
+		DEBUGP("ipt_ULOG: ulog_send: nothing to send\n");
+		return;
+	}
+
+	/* last nlmsg needs NLMSG_DONE */
+	if (ub->qlen > 1)
+		ub->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
+	DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
+		ub->qlen, nlgroupnum);
+	netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+
+	ub->qlen = 0;
+	ub->skb = NULL;
+	ub->lastnlh = NULL;
+
+}
+
+
+/* timer function to flush queue in flushtimeout time
+ * (data is the index into ulog_buffers, set up in init()) */
+static void ulog_timer(unsigned long data)
+{
+	DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
+
+	/* lock to protect against somebody modifying our structure
+	 * from ipt_ulog_target at the same time */
+	LOCK_BH(&ulog_lock);
+	ulog_send(data);
+	UNLOCK_BH(&ulog_lock);
+}
+
+/* Allocate the skb for a new queue: first try the full nlbufsiz, and if
+ * that fails fall back to `size' bytes (enough for the current packet).
+ * Returns NULL when both allocations fail. */
+static struct sk_buff *ulog_alloc_skb(unsigned int size)
+{
+	struct sk_buff *skb;
+
+	/* alloc skb which should be big enough for a whole
+	 * multipart message. WARNING: has to be <= 131000
+	 * due to slab allocator restrictions */
+
+	skb = alloc_skb(nlbufsiz, GFP_ATOMIC);
+	if (!skb) {
+		PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n",
+			nlbufsiz);
+
+		/* try to allocate only as much as we need for
+		 * current packet */
+
+		skb = alloc_skb(size, GFP_ATOMIC);
+		if (!skb)
+			PRINTR("ipt_ULOG: can't even allocate %ub\n", size);
+	}
+
+	return skb;
+}
+
+/* Append one packet (metadata plus up to copy_range bytes of payload) to
+ * the queue of the netlink group selected by loginfo->nl_group, flushing
+ * the queue to userspace when it is full or qthreshold is reached.
+ *
+ * `prefix', when non-NULL, overrides loginfo->prefix.
+ *
+ * NOTE(review): groupnum is ffs(nl_group) - 1 and is used unchecked as an
+ * index into ulog_buffers; nl_group == 0 is not rejected anywhere in this
+ * file -- confirm that userspace can never pass it. */
+static void ipt_ulog_packet(unsigned int hooknum,
+			    const struct sk_buff *skb,
+			    const struct net_device *in,
+			    const struct net_device *out,
+			    const struct ipt_ulog_info *loginfo,
+			    const char *prefix)
+{
+	ulog_buff_t *ub;
+	ulog_packet_msg_t *pm;
+	size_t size, copy_len;
+	struct nlmsghdr *nlh;
+
+	/* ffs == find first bit set, necessary because userspace
+	 * is already shifting groupnumber, but we need unshifted.
+	 * ffs() returns [1..32], we need [0..31] */
+	unsigned int groupnum = ffs(loginfo->nl_group) - 1;
+
+	/* calculate the size of the skb needed */
+	if ((loginfo->copy_range == 0) ||
+	    (loginfo->copy_range > skb->len)) {
+		copy_len = skb->len;
+	} else {
+		copy_len = loginfo->copy_range;
+	}
+
+	size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+
+	ub = &ulog_buffers[groupnum];
+
+	LOCK_BH(&ulog_lock);
+
+	if (!ub->skb) {
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	} else if (ub->qlen >= loginfo->qthreshold ||
+		   size > skb_tailroom(ub->skb)) {
+		/* either the queue len is too high or we don't have
+		 * enough room in nlskb left. send it to userspace. */
+
+		ulog_send(groupnum);
+
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	}
+
+	DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen,
+		loginfo->qthreshold);
+
+	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
+	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT,
+			sizeof(*pm)+copy_len);
+	ub->qlen++;
+
+	pm = NLMSG_DATA(nlh);
+
+	/* We might not have a timestamp, get one */
+	if (skb->stamp.tv_sec == 0)
+		do_gettimeofday((struct timeval *)&skb->stamp);
+
+	/* copy hook, prefix, timestamp, payload, etc. */
+	pm->data_len = copy_len;
+	pm->timestamp_sec = skb->stamp.tv_sec;
+	pm->timestamp_usec = skb->stamp.tv_usec;
+	pm->mark = skb->nfmark;
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strncpy(pm->prefix, prefix, sizeof(pm->prefix));
+	else if (loginfo->prefix[0] != '\0')
+		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
+	else
+		*(pm->prefix) = '\0';
+
+	/* only copy the link-layer header if there is one and it fits */
+	if (in && in->hard_header_len > 0
+	    && skb->mac.raw != (void *) skb->nh.iph
+	    && in->hard_header_len <= ULOG_MAC_LEN) {
+		memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
+		pm->mac_len = in->hard_header_len;
+	} else
+		pm->mac_len = 0;
+
+	if (in)
+		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+	else
+		pm->indev_name[0] = '\0';
+
+	if (out)
+		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+	else
+		pm->outdev_name[0] = '\0';
+
+	/* copy_len <= skb->len, so can't fail. */
+	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
+		BUG();
+
+	/* check if we are building multi-part messages */
+	if (ub->qlen > 1) {
+		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
+	}
+
+	ub->lastnlh = nlh;
+
+	/* if timer isn't already running, start it */
+	if (!timer_pending(&ub->timer)) {
+		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
+		add_timer(&ub->timer);
+	}
+
+	/* if threshold is reached, send message to userspace */
+	if (ub->qlen >= loginfo->qthreshold) {
+		if (loginfo->qthreshold > 1)
+			nlh->nlmsg_type = NLMSG_DONE;
+		ulog_send(groupnum);
+	}
+
+	UNLOCK_BH(&ulog_lock);
+
+	return;
+
+	/* nlmsg_failure deliberately falls through into alloc_failure,
+	 * so an NLMSG_PUT failure prints both messages */
+nlmsg_failure:
+	PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
+
+alloc_failure:
+	PRINTR("ipt_ULOG: Error building netlink message\n");
+
+	UNLOCK_BH(&ulog_lock);
+}
+
+/* iptables target hook: log the packet, then continue rule traversal */
+static unsigned int ipt_ulog_target(struct sk_buff **pskb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    unsigned int hooknum,
+				    const void *targinfo, void *userinfo)
+{
+	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+	ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
+
+	return IPT_CONTINUE;
+}
+
+/* nf_log callback (registered in init() when nflog is set): log with the
+ * default group and queue threshold, copying the whole packet */
+static void ipt_logfn(unsigned int hooknum,
+		      const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const char *prefix)
+{
+	struct ipt_ulog_info loginfo = {
+		.nl_group = ULOG_DEFAULT_NLGROUP,
+		.copy_range = 0,
+		.qthreshold = ULOG_DEFAULT_QTHRESHOLD,
+		.prefix = ""
+	};
+
+	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+/* Validate a ULOG rule: returns 1 if acceptable, 0 otherwise.  Checks the
+ * target info size, that the prefix is NUL-terminated, and that
+ * qthreshold does not exceed ULOG_MAX_QLEN. */
+static int ipt_ulog_checkentry(const char *tablename,
+			       const struct ipt_entry *e,
+			       void *targinfo,
+			       unsigned int targinfosize,
+			       unsigned int hookmask)
+{
+	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) {
+		DEBUGP("ipt_ULOG: targinfosize %u != %Zu\n", targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_ulog_info)));
+		return 0;
+	}
+
+	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
+		DEBUGP("ipt_ULOG: prefix term %i\n",
+		       loginfo->prefix[sizeof(loginfo->prefix) - 1]);
+		return 0;
+	}
+
+	if (loginfo->qthreshold > ULOG_MAX_QLEN) {
+		DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
+			loginfo->qthreshold);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_ulog_reg = {
+	.name		= "ULOG",
+	.target		= ipt_ulog_target,
+	.checkentry	= ipt_ulog_checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module init: validate nlbufsiz, set up one flush timer per netlink
+ * group, create the netlink socket and register the target (and, if
+ * nflog is set, the nf_log handler). */
+static int __init init(void)
+{
+	int i;
+
+	DEBUGP("ipt_ULOG: init module\n");
+
+	if (nlbufsiz >= 128*1024) {
+		printk("Netlink buffer has to be <= 128kB\n");
+		return -EINVAL;
+	}
+
+	/* initialize ulog_buffers */
+	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+		init_timer(&ulog_buffers[i].timer);
+		ulog_buffers[i].timer.function = ulog_timer;
+		ulog_buffers[i].timer.data = i;
+	}
+
+	nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	if (!nflognl)
+		return -ENOMEM;
+
+	if (ipt_register_target(&ipt_ulog_reg) != 0) {
+		sock_release(nflognl->sk_socket);
+		return -EINVAL;
+	}
+	if (nflog)
+		nf_log_register(PF_INET, &ipt_logfn);
+
+	return 0;
+}
+
+/* Module exit: undo init() and drop whatever is still queued */
+static void __exit fini(void)
+{
+	ulog_buff_t *ub;
+	int i;
+
+	DEBUGP("ipt_ULOG: cleanup_module\n");
+
+	if (nflog)
+		nf_log_unregister(PF_INET, &ipt_logfn);
+	ipt_unregister_target(&ipt_ulog_reg);
+	sock_release(nflognl->sk_socket);
+
+	/* remove pending timers and free allocated skb's */
+	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+		ub = &ulog_buffers[i];
+		if (timer_pending(&ub->timer)) {
+			DEBUGP("timer was pending, deleting\n");
+			del_timer(&ub->timer);
+		}
+
+		if (ub->skb) {
+			kfree_skb(ub->skb);
+			ub->skb = NULL;
+		}
+	}
+
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
new file mode 100644
index 000000000000..f5909a4c3fc7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -0,0 +1,77 @@
+/*
+ * iptables module to match inet_addr_type() of an ip.
+ *
+ * Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_addrtype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("iptables addrtype match");
+
+/* Returns 1 if the bit numbered inet_addr_type(addr) is set in mask,
+ * 0 otherwise. */
+static inline int match_type(u_int32_t addr, u_int16_t mask)
+{
+	return !!(mask & (1 << inet_addr_type(addr)));
+}
+
+/* Match callback.  The source and destination addresses are each tested
+ * only when the corresponding type mask (info->source / info->dest) is
+ * non-zero; each result is XORed with its invert flag and all tested
+ * results must be 1 for the packet to match. */
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_addrtype_info *info = matchinfo;
+	const struct iphdr *iph = skb->nh.iph;
+	int ret = 1;
+
+	if (info->source)
+		ret &= match_type(iph->saddr, info->source)^info->invert_source;
+	if (info->dest)
+		ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
+
+	return ret;
+}
+
+/* Rule validation: accept (1) only if the match data has the expected
+ * aligned size, otherwise log an error and reject (0). */
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
+		/* message ends with the newline; the stray '.' that used to
+		 * follow it leaked into the start of the next log line */
+		printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n",
+		       matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match addrtype_match = {
+	.name		= "addrtype",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&addrtype_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&addrtype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ah.c
b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000000..a0fea847cb72
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,117 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables AH SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+	int r=0;
+	duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+		min,spi,max);
+	r=(spi >= min && spi <= max) ^ invert;
+	duprintf(" result %s\n",r? "PASS" : "FAILED");
+	return r;
+}
+
+/* Match callback: compares the SPI of the AH header that follows the IP
+ * header against the [spis[0], spis[1]] range of the rule.  Fragments
+ * with a non-zero offset never match; a packet too short to hold the AH
+ * header is hot-dropped. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	struct ip_auth_hdr _ahdr, *ah;
+	const struct ipt_ah *ahinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_ahdr), &_ahdr);
+	if (ah == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil AH tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+			 ntohl(ah->spi),
+			 !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchinfosize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_ah *ahinfo = matchinfo;
+
+	/* Must specify proto == AH, and no unknown invflags */
+	if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
+		duprintf("ipt_ah: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_AH);
+		return 0;
+	}
+	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) {
+		/* the expected size is a sizeof expression, hence %Zu */
+		duprintf("ipt_ah: matchsize %u != %Zu\n",
+			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah)));
+		return 0;
+	}
+	if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+		duprintf("ipt_ah: unknown flags %X\n",
+			 ahinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match ah_match = {
+	.name		= "ah",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&ah_match);
+}
+
+static void __exit cleanup(void)
+{
+	ipt_unregister_match(&ah_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
new file mode 100644
index 000000000000..6b76a1ea5245
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_comment.c
@@ -0,0 +1,59 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("iptables comment match module");
+MODULE_LICENSE("GPL");
+
+/* Match callback: ignores the packet entirely and always returns 1 */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	/* We always match */
+	return 1;
+}
+
+/* Rule validation: only the size of the match data is verified */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	/* Check the size */
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
+		return 0;
+	return 1;
+}
+
+static struct ipt_match comment_match = {
+	.name		= "comment",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&comment_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&comment_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
new file mode 100644
index 000000000000..2706f96cea55
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -0,0 +1,81 @@
+/* This kernel module matches connection mark values set by the
+ * CONNMARK target
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+/* address spelled as in the copyright header (marasystems.com) */
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
+MODULE_DESCRIPTION("IP tables connmark match module");
+MODULE_LICENSE("GPL");
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connmark.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+/* Match callback: a packet without a conntrack entry never matches.
+ * Otherwise the connection mark, masked with info->mask, is compared to
+ * info->mark and the result is XORed with info->invert. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_connmark_info *info = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct)
+		return 0;
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+/* Rule validation: only the size of the match data is verified */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match connmark_match = {
+	.name = "connmark",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&connmark_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&connmark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
new file mode 100644
index 000000000000..c1d22801b7cf
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -0,0 +1,136 @@
+/* Kernel module to match connection tracking information.
+ * Superset of Rusty's minimalistic state match.
+ *
+ * (C) 2001 Marc Boucher (marc@mbsi.ca).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables connection tracking match module");
+
+/* Match callback.  Every criterion whose bit is set in sinfo->flags is
+ * evaluated in turn; the first one that fails makes the function return
+ * 0, and 1 is returned only when all of them pass.  Except for the state
+ * test, every criterion fails outright when the packet has no conntrack
+ * entry (ct == NULL). */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_conntrack_info *sinfo = matchinfo;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+
+/* XOR a "mismatch" condition with the rule's invert bit for that criterion */
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+	if (ct == &ip_conntrack_untracked)
+		statebit = IPT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+		statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
+	else
+		statebit = IPT_CONNTRACK_STATE_INVALID;
+
+	if(sinfo->flags & IPT_CONNTRACK_STATE) {
+		if (ct) {
+			/* original source differs from reply destination:
+			 * flag the connection as source-NATed */
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
+				statebit |= IPT_CONNTRACK_STATE_SNAT;
+
+			/* original destination differs from reply source:
+			 * flag the connection as destination-NATed */
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
+				statebit |= IPT_CONNTRACK_STATE_DNAT;
+		}
+
+		if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_PROTO) {
+		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
+			return 0;
+	}
+
+	/* the four address tests below compare the masked conntrack address
+	 * with the address stored in the rule */
+	if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_STATUS) {
+		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
+		unsigned long expires;
+
+		if(!ct)
+			return 0;
+
+		/* remaining lifetime in whole seconds; 0 when the timeout
+		 * timer is not pending */
+		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Rule validation: only the size of the match data is verified */
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match conntrack_match = {
+	.name		= "conntrack",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	need_ip_conntrack();
+	return ipt_register_match(&conntrack_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&conntrack_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
new file mode 100644
index 000000000000..5df52a64a5d4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -0,0 +1,63 @@
+/* IP tables module for matching the value of the IPv4 DSCP field
+ *
+ * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_dscp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables DSCP matching module");
+MODULE_LICENSE("GPL");
+
+/* Match callback: compare the DSCP bits of the IP TOS byte with the
+ * value configured in the rule, honouring the rule's invert flag. */
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_dscp_info *dinfo = matchinfo;
+	/* rule value moved into its position inside the TOS byte */
+	const u_int8_t wanted = (dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK;
+	/* DSCP bits actually carried by this packet */
+	const u_int8_t seen = skb->nh.iph->tos & IPT_DSCP_MASK;
+
+	return (seen == wanted) ^ dinfo->invert;
+}
+
+/* Rule validation: accepted (1) only when the match data has exactly the
+ * aligned size of struct ipt_dscp_info, rejected (0) otherwise. */
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	return matchsize == IPT_ALIGN(sizeof(struct ipt_dscp_info));
+}
+
+static struct ipt_match dscp_match = {
+	.name		= "dscp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* module load: make the "dscp" match available to iptables */
+static int __init init(void)
+{
+	return ipt_register_match(&dscp_match);
+}
+
+/* module unload: withdraw the "dscp" match again */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&dscp_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 000000000000..b6f7181e89cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,131 @@
+/* IP tables module for matching the value of the IPv4 and TCP ECN bits
+ *
+ * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ecn.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables ECN matching module");
+MODULE_LICENSE("GPL");
+
+/* Returns 1 if the ECN bits of the IP TOS byte equal einfo->ip_ect */
+static inline int match_ip(const struct sk_buff *skb,
+			   const struct ipt_ecn_info *einfo)
+{
+	return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
+}
+
+/* Test the TCP ECE and CWR flags selected in einfo->operation; a flag
+ * whose bit is also set in einfo->invert must be clear, otherwise it
+ * must be set.  Returns 1 when all selected flags pass, 0 otherwise. */
+static inline int match_tcp(const struct sk_buff *skb,
+			    const struct ipt_ecn_info *einfo,
+			    int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+
+	/* In practice, TCP match does this, so can't fail.  But let's
+	 * be good citizens.
+	 */
+	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		/* NOTE(review): ipt_ah and ipt_esp set *hotdrop = 1 when the
+		 * header cannot be read; here it is cleared instead --
+		 * confirm that this is intended. */
+		*hotdrop = 0;
+		return 0;
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
+		if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
+			if (th->ece == 1)
+				return 0;
+		} else {
+			if (th->ece == 0)
+				return 0;
+		}
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
+		if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
+			if (th->cwr == 1)
+				return 0;
+		} else {
+			if (th->cwr == 0)
+				return 0;
+		}
+	}
+
+	return 1;
+}
+
+/* Match callback: runs the IP-level test and/or the TCP-level test as
+ * selected by info->operation; the TCP test is only attempted on TCP
+ * packets.  Returns 1 only if every selected test passes. */
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_ecn_info *info = matchinfo;
+
+	if (info->operation & IPT_ECN_OP_MATCH_IP)
+		if (!match_ip(skb, info))
+			return 0;
+
+	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
+		if (skb->nh.iph->protocol != IPPROTO_TCP)
+			return 0;
+		if (!match_tcp(skb, info, hotdrop))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Rule validation: checks the match data size, rejects rules that have
+ * any IPT_ECN_OP_MATCH_MASK bit set in operation or invert, and refuses
+ * TCP flag matching in a rule whose protocol is not TCP. */
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	const struct ipt_ecn_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
+		return 0;
+
+	if (info->operation & IPT_ECN_OP_MATCH_MASK)
+		return 0;
+
+	if (info->invert & IPT_ECN_OP_MATCH_MASK)
+		return 0;
+
+	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
+	    && ip->proto != IPPROTO_TCP) {
+		printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
+		       " non-tcp packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match ecn_match = {
+	.name		= "ecn",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&ecn_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&ecn_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
new file mode 100644
index 000000000000..e1d0dd31e117
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -0,0 +1,118 @@
+/* Kernel module to match ESP parameters. */
+
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_esp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables ESP SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+	int r=0;
+	duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+		min,spi,max);
+	r=(spi >= min && spi <= max) ^ invert;
+	duprintf(" result %s\n",r? "PASS" : "FAILED");
+	return r;
+}
+
+/* Match callback: compares the SPI of the ESP header that follows the IP
+ * header against the [spis[0], spis[1]] range of the rule.  Fragments
+ * with a non-zero offset never match; a packet too short to hold the ESP
+ * header is hot-dropped. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	struct ip_esp_hdr _esp, *eh;
+	const struct ipt_esp *espinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_esp), &_esp);
+	if (eh == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil ESP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return spi_match(espinfo->spis[0], espinfo->spis[1],
+			 ntohl(eh->spi),
+			 !!(espinfo->invflags & IPT_ESP_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchinfosize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_esp *espinfo = matchinfo;
+
+	/* Must specify proto == ESP, and no unknown invflags */
+	if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
+		duprintf("ipt_esp: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_ESP);
+		return 0;
+	}
+	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) {
+		/* the expected size is a sizeof expression, hence %Zu */
+		duprintf("ipt_esp: matchsize %u != %Zu\n",
+			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp)));
+		return 0;
+	}
+	if (espinfo->invflags & ~IPT_ESP_INV_MASK) {
+		duprintf("ipt_esp: unknown flags %X\n",
+			 espinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match esp_match = {
+	.name		= "esp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&esp_match);
+}
+
+static void __exit cleanup(void)
+{
+	ipt_unregister_match(&esp_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
new file mode 100644
index
000000000000..f1937190cd77 --- /dev/null +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -0,0 +1,731 @@ +/* iptables match extension to limit the number of packets per second + * separately for each hashbucket (sourceip/sourceport/dstip/dstport) + * + * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> + * + * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $ + * + * Development of this code was funded by Astaro AG, http://www.astaro.com/ + * + * based on ipt_limit.c by: + * Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * Rusty Russell <rusty@rustcorp.com.au> + * + * The general idea is to create a hash table for every dstip and have a + * separate limit counter per tuple. This way you can do something like 'limit + * the number of syn packets for each of my internal addresses'. + * + * Ideally this would just be implemented as a general 'hash' match, which would + * allow us to attach any iptables target to its hash buckets. But this is + * not possible in the current iptables architecture.
As always, pkttables for + * 2.7.x will help ;) + */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/random.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/list.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_hashlimit.h> +#include <linux/netfilter_ipv4/lockhelp.h> + +/* FIXME: this is just for IP_NF_ASSERRT */ +#include <linux/netfilter_ipv4/ip_conntrack.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("iptables match for limiting per hash-bucket"); + +/* need to declare this at the top */ +static struct proc_dir_entry *hashlimit_procdir; +static struct file_operations dl_file_ops; + +/* hash table crap */ + +struct dsthash_dst { + u_int32_t src_ip; + u_int32_t dst_ip; + /* ports have to be consecutive !!! 
*/ + u_int16_t src_port; + u_int16_t dst_port; +}; + +struct dsthash_ent { + /* static / read-only parts in the beginning */ + struct hlist_node node; + struct dsthash_dst dst; + + /* modified structure members in the end */ + unsigned long expires; /* precalculated expiry time */ + struct { + unsigned long prev; /* last modification */ + u_int32_t credit; + u_int32_t credit_cap, cost; + } rateinfo; +}; + +struct ipt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ + atomic_t use; + + struct hashlimit_cfg cfg; /* config */ + + /* used internally */ + spinlock_t lock; /* lock for list_head */ + u_int32_t rnd; /* random seed for hash */ + struct timer_list timer; /* timer for gc */ + atomic_t count; /* number entries in table */ + + /* seq_file stuff */ + struct proc_dir_entry *pde; + + struct hlist_head hash[0]; /* hashtable itself */ +}; + +static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ +static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +static HLIST_HEAD(hashlimit_htables); +static kmem_cache_t *hashlimit_cachep; + +static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) +{ + return (ent->dst.dst_ip == b->dst_ip + && ent->dst.dst_port == b->dst_port + && ent->dst.src_port == b->src_port + && ent->dst.src_ip == b->src_ip); +} + +static inline u_int32_t +hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst) +{ + return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port), + dst->src_ip, ht->rnd) % ht->cfg.size); +} + +static inline struct dsthash_ent * +__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + struct hlist_node *pos; + u_int32_t hash = hash_dst(ht, dst); + + if (!hlist_empty(&ht->hash[hash])) + hlist_for_each_entry(ent, pos, &ht->hash[hash], node) { + if (dst_cmp(ent, dst)) { + return ent; + } + } + + return NULL; +} + +/* allocate dsthash_ent, initialize dst, put in 
htable and lock it */ +static struct dsthash_ent * +__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + + /* initialize hash with random val at the time we allocate + * the first hashtable entry */ + if (!ht->rnd) + get_random_bytes(&ht->rnd, 4); + + if (ht->cfg.max && + atomic_read(&ht->count) >= ht->cfg.max) { + /* FIXME: do something. question is what.. */ + if (net_ratelimit()) + printk(KERN_WARNING + "ipt_hashlimit: max count of %u reached\n", + ht->cfg.max); + return NULL; + } + + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (!ent) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_hashlimit: can't allocate dsthash_ent\n"); + return NULL; + } + + atomic_inc(&ht->count); + + ent->dst.dst_ip = dst->dst_ip; + ent->dst.dst_port = dst->dst_port; + ent->dst.src_ip = dst->src_ip; + ent->dst.src_port = dst->src_port; + + hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]); + + return ent; +} + +static inline void +__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent) +{ + hlist_del(&ent->node); + kmem_cache_free(hashlimit_cachep, ent); + atomic_dec(&ht->count); +} +static void htable_gc(unsigned long htlong); + +static int htable_create(struct ipt_hashlimit_info *minfo) +{ + int i; + unsigned int size; + struct ipt_hashlimit_htable *hinfo; + + if (minfo->cfg.size) + size = minfo->cfg.size; + else { + size = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + size = 8192; + if (size < 16) + size = 16; + } + /* FIXME: don't use vmalloc() here or anywhere else -HW */ + hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable) + + (sizeof(struct list_head) * size)); + if (!hinfo) { + printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n"); + return -1; + } + minfo->hinfo = hinfo; + + /* copy match config into hashtable config */ + memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + 
hinfo->cfg.size = size; + if (!hinfo->cfg.max) + hinfo->cfg.max = 8 * hinfo->cfg.size; + else if (hinfo->cfg.max < hinfo->cfg.size) + hinfo->cfg.max = hinfo->cfg.size; + + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + + atomic_set(&hinfo->count, 0); + atomic_set(&hinfo->use, 1); + hinfo->rnd = 0; + spin_lock_init(&hinfo->lock); + hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); + if (!hinfo->pde) { + vfree(hinfo); + return -1; + } + hinfo->pde->proc_fops = &dl_file_ops; + hinfo->pde->data = hinfo; + + init_timer(&hinfo->timer); + hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); + hinfo->timer.data = (unsigned long )hinfo; + hinfo->timer.function = htable_gc; + add_timer(&hinfo->timer); + + LOCK_BH(&hashlimit_lock); + hlist_add_head(&hinfo->node, &hashlimit_htables); + UNLOCK_BH(&hashlimit_lock); + + return 0; +} + +/* Selector for htable_selective_cleanup(): selects every entry. */ +static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return 1; +} + +/* Selector for htable_selective_cleanup(): selects entries whose expiry + * time has been reached. time_after_eq() compares through a signed + * difference, so the result stays correct when jiffies wraps around; a + * plain `>=' would not. */ +static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return time_after_eq(jiffies, he->expires); +} + +/* Walk every bucket with ht->lock held and free each entry for which + * select() returns non-zero. */ +static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht, + int (*select)(struct ipt_hashlimit_htable *ht, + struct dsthash_ent *he)) +{ + int i; + + IP_NF_ASSERT(ht->cfg.size && ht->cfg.max); + + /* lock hash table and iterate over it */ + spin_lock_bh(&ht->lock); + for (i = 0; i < ht->cfg.size; i++) { + struct dsthash_ent *dh; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + if ((*select)(ht, dh)) + __dsthash_free(ht, dh); + } + } + spin_unlock_bh(&ht->lock); +} + +/* hash table garbage collector, run by timer */ +static void htable_gc(unsigned long htlong) +{ + struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong; + + htable_selective_cleanup(ht, select_gc); + + /* re-add the timer accordingly */ + ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); + 
add_timer(&ht->timer); +} + +/* Tear down a hash table: stop the gc timer, remove the proc entry, free + * every entry and finally the table itself. */ +static void htable_destroy(struct ipt_hashlimit_htable *hinfo) +{ + /* Stop the gc timer and wait for a running htable_gc() to finish. + * The handler re-arms itself, so timer_pending() + del_timer() could + * leave it running (or queued again) while the table is freed below. */ + del_timer_sync(&hinfo->timer); + + /* remove proc entry */ + remove_proc_entry(hinfo->pde->name, hashlimit_procdir); + + htable_selective_cleanup(hinfo, select_all); + vfree(hinfo); +} + +/* Look up a table by name on the global list; on a hit, take a reference + * (hinfo->use) and return it, otherwise return NULL. */ +static struct ipt_hashlimit_htable *htable_find_get(char *name) +{ + struct ipt_hashlimit_htable *hinfo; + struct hlist_node *pos; + + LOCK_BH(&hashlimit_lock); + hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { + if (!strcmp(name, hinfo->pde->name)) { + atomic_inc(&hinfo->use); + UNLOCK_BH(&hashlimit_lock); + return hinfo; + } + } + UNLOCK_BH(&hashlimit_lock); + + return NULL; +} + +/* Drop a reference; the last put unlinks the table and destroys it. */ +static void htable_put(struct ipt_hashlimit_htable *hinfo) +{ + if (atomic_dec_and_test(&hinfo->use)) { + LOCK_BH(&hashlimit_lock); + hlist_del(&hinfo->node); + UNLOCK_BH(&hashlimit_lock); + htable_destroy(hinfo); + } +} + + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. 
+*/ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +/* Precision saver. */ +static inline u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE; +} + +static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +{ + dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now)) + * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) + dh->rateinfo.credit = dh->rateinfo.credit_cap; +} + +static inline int get_ports(const struct sk_buff *skb, int offset, + u16 ports[2]) +{ + union { + struct tcphdr th; + struct udphdr uh; + sctp_sctphdr_t sctph; + } hdr_u, *ptr_u; + + /* Must not be a fragment. */ + if (offset) + return 1; + + /* Must be big enough to read ports (both UDP and TCP have + them at the start). 
*/ + ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); + if (!ptr_u) + return 1; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + ports[0] = ptr_u->th.source; + ports[1] = ptr_u->th.dest; + break; + case IPPROTO_UDP: + ports[0] = ptr_u->uh.source; + ports[1] = ptr_u->uh.dest; + break; + case IPPROTO_SCTP: + ports[0] = ptr_u->sctph.source; + ports[1] = ptr_u->sctph.dest; + break; + default: + /* all other protocols don't support per-port hash + * buckets, so both ports are reported as 0 */ + ports[0] = ports[1] = 0; + break; + } + + return 0; +} + + +/* Per-packet match hook: builds the hash key from the fields selected in + * hinfo->cfg.mode, then looks up (or creates) the matching entry with + * hinfo->lock held. Sets *hotdrop when the ports are needed but cannot + * be read from the packet. */ +static int +hashlimit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_hashlimit_info *r = + ((struct ipt_hashlimit_info *)matchinfo)->u.master; + struct ipt_hashlimit_htable *hinfo = r->hinfo; + unsigned long now = jiffies; + struct dsthash_ent *dh; + struct dsthash_dst dst; + + /* build 'dst' according to hinfo->cfg and current packet */ + memset(&dst, 0, sizeof(dst)); + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP) + dst.dst_ip = skb->nh.iph->daddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP) + dst.src_ip = skb->nh.iph->saddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT + ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { + u_int16_t ports[2]; + if (get_ports(skb, offset, ports)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + *hotdrop = 1; + return 0; + } + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) + dst.src_port = ports[0]; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT) + dst.dst_port = ports[1]; + } + + spin_lock_bh(&hinfo->lock); + dh = __dsthash_find(hinfo, &dst); + if (!dh) { + dh = __dsthash_alloc_init(hinfo, &dst); + + if (!dh) { + /* enomem... 
don't match == DROP */ + if (net_ratelimit()) + printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__); + spin_unlock_bh(&hinfo->lock); + return 0; + } + + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + + spin_unlock_bh(&hinfo->lock); + return 1; + } + + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + + rateinfo_recalc(dh, now); + if (dh->rateinfo.credit >= dh->rateinfo.cost) { + /* We're underlimit. */ + dh->rateinfo.credit -= dh->rateinfo.cost; + spin_unlock_bh(&hinfo->lock); + return 1; + } + + spin_unlock_bh(&hinfo->lock); + + /* default case: we're overlimit, thus don't match */ + return 0; +} + +/* checkentry hook: validates the user-supplied match data and attaches the + * rule to the hash table named r->name, creating the table if it does not + * exist yet. Returns 1 to accept the rule, 0 to reject it. */ +static int +hashlimit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_hashlimit_info *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info))) + return 0; + + /* Check for overflow. */ + if (r->cfg.burst == 0 + || user2credits(r->cfg.avg * r->cfg.burst) < + user2credits(r->cfg.avg)) { + printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n", + r->cfg.avg, r->cfg.burst); + return 0; + } + + if (r->cfg.mode == 0 + || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT + |IPT_HASHLIMIT_HASH_DIP + |IPT_HASHLIMIT_HASH_SIP + |IPT_HASHLIMIT_HASH_SPT)) + return 0; + + if (!r->cfg.gc_interval) + return 0; + + if (!r->cfg.expire) + return 0; + + /* The name is handed to strcmp() and create_proc_entry() below, so + * refuse a name that is not NUL-terminated within its buffer. */ + if (r->name[sizeof(r->name) - 1] != '\0') + return 0; + + /* This is the best we've got: We cannot release and re-grab lock, + * since checkentry() is called before ip_tables.c grabs ipt_mutex. + * We also cannot grab the hashtable spinlock, since htable_create will + * call vmalloc, and that can sleep. 
And we cannot just re-search + * the list of htable's in htable_create(), since then we would + * create duplicate proc files. -HW */ + down(&hlimit_mutex); + r->hinfo = htable_find_get(r->name); + if (!r->hinfo && (htable_create(r) != 0)) { + up(&hlimit_mutex); + return 0; + } + up(&hlimit_mutex); + + /* Ugly hack: For SMP, we only want to use one set */ + r->u.master = r; + + return 1; +} + +/* destroy hook: drops the rule's reference on its hash table. */ +static void +hashlimit_destroy(void *matchinfo, unsigned int matchsize) +{ + struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + + htable_put(r->hinfo); +} + +static struct ipt_match ipt_hashlimit = { + .name = "hashlimit", + .match = hashlimit_match, + .checkentry = hashlimit_checkentry, + .destroy = hashlimit_destroy, + .me = THIS_MODULE +}; + +/* PROC stuff */ + +/* seq_file start callback: takes the table lock (released in dl_seq_stop()) + * and returns a heap-allocated cursor holding the bucket index *pos, NULL + * when *pos is past the last bucket, or ERR_PTR(-ENOMEM). */ +static void *dl_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket; + + spin_lock_bh(&htable->lock); + if (*pos >= htable->cfg.size) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); + if (!bucket) + return ERR_PTR(-ENOMEM); + + *bucket = *pos; + return bucket; +} + +/* seq_file next callback: advances the cursor to the following bucket; + * frees it and returns NULL once the end of the table is reached. */ +static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= htable->cfg.size) { + kfree(v); + return NULL; + } + return bucket; +} + +/* seq_file stop callback: frees the cursor and drops the table lock taken + * in dl_seq_start(). v is whatever start/next returned last, which may be + * the ERR_PTR() from a failed allocation; that value must not be passed + * to kfree(). */ +static void dl_seq_stop(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + if (!IS_ERR(bucket)) + kfree(bucket); + + spin_unlock_bh(&htable->lock); +} + +static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s) +{ + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies); + + return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n", + 
(long)(ent->expires - jiffies)/HZ, + NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port), + NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); +} + +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + struct hlist_node *pos; + + if (!hlist_empty(&htable->hash[*bucket])) + hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) { + if (dl_seq_real_show(ent, s)) { + /* buffer was filled and unable to print that tuple */ + return 1; + } + } + + return 0; +} + +static struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + +static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +} + +static struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int init_or_fini(int fini) +{ + int ret = 0; + + if (fini) + goto cleanup; + + if (ipt_register_match(&ipt_hashlimit)) { + ret = -EINVAL; + goto cleanup_nothing; + } + + hashlimit_cachep = kmem_cache_create("ipt_hashlimit", + sizeof(struct dsthash_ent), 0, + 0, NULL, NULL); + if (!hashlimit_cachep) { + printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n"); + ret = -ENOMEM; + goto cleanup_unreg_match; + } + + hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir) { + printk(KERN_ERR "Unable to create proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_free_slab; + } + + return ret; + +cleanup: + remove_proc_entry("ipt_hashlimit", proc_net); +cleanup_free_slab: + kmem_cache_destroy(hashlimit_cachep); 
+cleanup_unreg_match: + ipt_unregister_match(&ipt_hashlimit); +cleanup_nothing: + return ret; + +} + +static int __init init(void) +{ + return init_or_fini(0); +} + +static void __exit fini(void) +{ + init_or_fini(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c new file mode 100644 index 000000000000..33fdf364d3d3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_helper.c @@ -0,0 +1,113 @@ +/* iptables module to match on related connections */ +/* + * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Mar 2002 Harald Welte <laforge@gnumonks.org>: + * - Port to newnat infrastructure + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4/ip_conntrack.h> +#include <linux/netfilter_ipv4/ip_conntrack_core.h> +#include <linux/netfilter_ipv4/ip_conntrack_helper.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_helper.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>"); +MODULE_DESCRIPTION("iptables helper match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! 
invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + READ_LOCK(&ip_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + READ_UNLOCK(&ip_conntrack_lock); + return ret; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_helper_info *info = matchinfo; + + info->name[29] = '\0'; + + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info))) + return 0; + + return 1; +} + +static struct ipt_match helper_match = { + .name = "helper", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&helper_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&helper_match); +} + +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c new file mode 100644 index 000000000000..b835b7b2e560 --- /dev/null +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -0,0 +1,99 @@ +/* + * iptables module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_iprange.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); +MODULE_DESCRIPTION("iptables arbitrary IP range match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_iprange_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + if (info->flags & IPRANGE_SRC) { + if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) + || (ntohl(iph->saddr) > ntohl(info->src.max_ip))) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + DEBUGP("src IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + NIPQUAD(info->src.min_ip), + NIPQUAD(info->src.max_ip)); + return 0; + } + } + if (info->flags & IPRANGE_DST) { + if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip)) + || (ntohl(iph->daddr) > ntohl(info->dst.max_ip))) + ^ !!(info->flags & IPRANGE_DST_INV)) { + DEBUGP("dst IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->daddr), + info->flags & IPRANGE_DST_INV ? 
"(INV) " : "", + NIPQUAD(info->dst.min_ip), + NIPQUAD(info->dst.max_ip)); + return 0; + } + } + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info))) + return 0; + + return 1; +} + +static struct ipt_match iprange_match = +{ + .list = { NULL, NULL }, + .name = "iprange", + .match = &match, + .checkentry = &check, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&iprange_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&iprange_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c new file mode 100644 index 000000000000..4eabcfbda9d1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_length.c @@ -0,0 +1,64 @@ +/* Kernel module to match packet length. */ +/* (C) 1999-2001 James Morris <jmorros@intercode.com.au> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_length.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); +MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info))) + return 0; + + return 1; +} + +static struct ipt_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&length_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&length_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c new file mode 100644 index 000000000000..0c24dcc703a5 --- /dev/null +++ b/net/ipv4/netfilter/ipt_limit.c @@ -0,0 +1,157 @@ +/* Kernel module to control the rate + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). */ + +/* (C) 1999 Jérôme de Vivie <devivie@info.enserb.u-bordeaux.fr> + * (C) 1999 Hervé Eychenne <eychenne@info.enserb.u-bordeaux.fr> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/spinlock.h> +#include <linux/interrupt.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_limit.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>"); +MODULE_DESCRIPTION("iptables rate limit match"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Single lock taken around the credit update in the match function. */ +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. 
*/ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static int +ipt_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE; +} + +static int +ipt_limit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_rateinfo *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Overflow in ipt_limit, try lower: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. 
*/ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. */ + r->master = r; + + return 1; +} + +static struct ipt_match ipt_limit_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_match(&ipt_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ipt_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c new file mode 100644 index 000000000000..11a459e33f25 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mac.c @@ -0,0 +1,79 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/if_ether.h> + +#include <linux/netfilter_ipv4/ipt_mac.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables mac matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data + /* If so, compare... 
*/ + && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ipt_mac_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* FORWARD isn't always valid, but it's nice to be able to do --RR */ + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { + printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) + return 0; + + return 1; +} + +static struct ipt_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ipt_mac_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c new file mode 100644 index 000000000000..8955728127b9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mark.c @@ -0,0 +1,64 @@ +/* Kernel module to match NFMARK values. */ + +/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_mark.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables mark matching module");
+
+/* Match callback: compare the masked packet mark with the rule's mark
+ * and flip the outcome when the rule is inverted.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_mark_info *cfg = matchinfo;
+	int same_mark = (skb->nfmark & cfg->mask) == cfg->mark;
+
+	return same_mark ^ cfg->invert;
+}
+
+/* Rule validation: the rule is accepted (1) only when the match data has
+ * exactly the aligned size of struct ipt_mark_info, otherwise rejected (0).
+ */
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	return matchsize == IPT_ALIGN(sizeof(struct ipt_mark_info));
+}
+
+static struct ipt_match mark_match = {
+	.name		= "mark",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the "mark" match and hand back the result. */
+static int __init init(void)
+{
+	int err = ipt_register_match(&mark_match);
+
+	return err;
+}
+
+/* Module unload: drop the registration made in init(). */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&mark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 000000000000..99e8188162e2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,212 @@
+/* Kernel module to match one of a list of TCP/UDP ports: ports are in
+   the same place so we can treat them as equal. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_multiport.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables multiple port match module");
+
+#if 0
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the port is matched by the test, 0 otherwise.
+ *
+ * Every one of the first `count' entries of portlist is compared with
+ * src unless flags is IPT_MULTIPORT_DESTINATION, and with dst unless
+ * flags is IPT_MULTIPORT_SOURCE.  The caller must make sure count does
+ * not exceed the size of portlist; checkentry() enforces that.
+ */
+static inline int
+ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
+	    u_int8_t count, u_int16_t src, u_int16_t dst)
+{
+	unsigned int i;
+	for (i=0; i<count; i++) {
+		if (flags != IPT_MULTIPORT_DESTINATION
+		    && portlist[i] == src)
+			return 1;
+
+		if (flags != IPT_MULTIPORT_SOURCE
+		    && portlist[i] == dst)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Returns 1 if the port is matched by the test, 0 otherwise; the result
+ * is already combined with minfo->invert.
+ *
+ * A non-zero pflags[i] marks ports[i] as the start of a range whose end
+ * is ports[i + 1]; otherwise ports[i] is a single port.
+ */
+static inline int
+ports_match_v1(const struct ipt_multiport_v1 *minfo,
+	       u_int16_t src, u_int16_t dst)
+{
+	unsigned int i;
+	u_int16_t s, e;
+
+	for (i=0; i < minfo->count; i++) {
+		s = minfo->ports[i];
+
+		if (minfo->pflags[i]) {
+			/* A range needs an end port.  A range flag on the
+			 * last used entry is malformed: stop here instead
+			 * of reading past the entries the rule supplied. */
+			if (i + 1 >= minfo->count)
+				break;
+
+			/* range port matching */
+			e = minfo->ports[++i];
+			duprintf("src or dst matches with %d-%d?\n", s, e);
+
+			if (minfo->flags == IPT_MULTIPORT_SOURCE
+			    && src >= s && src <= e)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_DESTINATION
+			    && dst >= s && dst <= e)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_EITHER
+			    && ((dst >= s && dst <= e)
+				|| (src >= s && src <= e)))
+				return 1 ^ minfo->invert;
+		} else {
+			/* exact port matching */
+			duprintf("src or dst matches with %d?\n", s);
+
+			if (minfo->flags == IPT_MULTIPORT_SOURCE
+			    && src == s)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_DESTINATION
+			    && dst == s)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_EITHER
+			    && (src == s || dst == s))
+				return 1 ^ minfo->invert;
+		}
+	}
+
+	return minfo->invert;
+}
+
+/* Revision 0 match callback.  Fragments with a non-zero offset never
+ * match.  If the two 16-bit port fields that follow the IP header cannot
+ * be read, *hotdrop is set and 0 is returned.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	u16 _ports[2], *pptr;
+	const struct ipt_multiport *multiinfo = matchinfo;
+
+	if (offset)
+		return 0;
+
+	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("ipt_multiport:"
+			 " Dropping evil offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return ports_match(multiinfo->ports,
+			   multiinfo->flags, multiinfo->count,
+			   ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+/* Revision 1 match callback; same packet handling as match(), but the
+ * comparison (ranges and inversion included) is done by ports_match_v1().
+ */
+static int
+match_v1(const struct sk_buff *skb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 const void *matchinfo,
+	 int offset,
+	 int *hotdrop)
+{
+	u16 _ports[2], *pptr;
+	const struct ipt_multiport_v1 *multiinfo = matchinfo;
+
+	if (offset)
+		return 0;
+
+	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("ipt_multiport:"
+			 " Dropping evil offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+/* Called when user tries to insert an entry of this type.
+ *
+ * Returns 1 to accept the rule, 0 to reject it.  Besides the size check,
+ * count is bounded by IPT_MULTI_PORTS because match() uses it as the loop
+ * bound over the ports[] array; the size is checked first so matchinfo is
+ * only read once it is known to hold a struct ipt_multiport.
+ */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_multiport *multiinfo = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_multiport)))
+		return 0;
+
+	return multiinfo->count <= IPT_MULTI_PORTS;
+}
+
+/* Revision 1 counterpart of checkentry(): same size and count checks,
+ * applied to struct ipt_multiport_v1.
+ */
+static int
+checkentry_v1(const char *tablename,
+	      const struct ipt_ip *ip,
+	      void *matchinfo,
+	      unsigned int matchsize,
+	      unsigned int hook_mask)
+{
+	const struct ipt_multiport_v1 *multiinfo = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_multiport_v1)))
+		return 0;
+
+	return multiinfo->count <= IPT_MULTI_PORTS;
+}
+
+static struct ipt_match multiport_match = {
+	.name		= "multiport",
+	.revision	= 0,
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct ipt_match multiport_match_v1 = {
+	.name		= "multiport",
+	.revision	= 1,
+	.match		= &match_v1,
+	.checkentry	= &checkentry_v1,
+	.me		= THIS_MODULE,
+};
+
+/* Registers both revisions; if the second registration fails the first
+ * one is undone so the module is never left half-registered.
+ */
+static int __init init(void)
+{
+	int err;
+
+	err = ipt_register_match(&multiport_match);
+	if (!err) {
+		err = ipt_register_match(&multiport_match_v1);
+		if (err)
+			ipt_unregister_match(&multiport_match);
+	}
+
+	return err;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&multiport_match);
+	ipt_unregister_match(&multiport_match_v1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 000000000000..3b9065e06381
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,217 @@
+/* Kernel module to match various things tied to sockets associated with
+   locally generated outgoing packets. */
+
+/* (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables owner match");
+/* Returns 1 if some thread whose comm equals `comm' (compared over
+ * sizeof(p->comm) bytes) has the packet's socket file in its fd table,
+ * 0 otherwise.  Walks every thread under tasklist_lock (read side) and
+ * scans each candidate's fd table with task_lock and file_lock held.
+ * Dereferences skb->sk->sk_socket->file without checking it; match()
+ * below checks those pointers before calling.
+ */
+static int
+match_comm(const struct sk_buff *skb, const char *comm)
+{
+	struct task_struct *g, *p;
+	struct files_struct *files;
+	int i;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if(strncmp(p->comm, comm, sizeof(p->comm)))
+			continue;
+
+		task_lock(p);
+		files = p->files;
+		if(files) {
+			spin_lock(&files->file_lock);
+			for (i=0; i < files->max_fds; i++) {
+				if (fcheck_files(files, i) ==
+				    skb->sk->sk_socket->file) {
+					/* found: release the locks in reverse order of acquisition */
+					spin_unlock(&files->file_lock);
+					task_unlock(p);
+					read_unlock(&tasklist_lock);
+					return 1;
+				}
+			}
+			spin_unlock(&files->file_lock);
+		}
+		task_unlock(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+/* Returns 1 if the task found by find_task_by_pid(pid) has the packet's
+ * socket file in its fd table; 0 if there is no such task, it has no
+ * files, or no descriptor refers to that file.
+ */
+static int
+match_pid(const struct sk_buff *skb, pid_t pid)
+{
+	struct task_struct *p;
+	struct files_struct *files;
+	int i;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out;
+	task_lock(p);
+	files = p->files;
+	if(files) {
+		spin_lock(&files->file_lock);
+		for (i=0; i < files->max_fds; i++) {
+			if (fcheck_files(files, i) ==
+			    skb->sk->sk_socket->file) {
+				spin_unlock(&files->file_lock);
+				task_unlock(p);
+				read_unlock(&tasklist_lock);
+				return 1;
+			}
+		}
+		spin_unlock(&files->file_lock);
+	}
+	task_unlock(p);
+out:
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+/* Returns 1 if any thread whose signal->session equals sid has the
+ * packet's socket file in its fd table, 0 otherwise.  Unlike the two
+ * helpers above it records the hit in `found' and leaves through the
+ * single unlock path at `out'.
+ */
+static int
+match_sid(const struct sk_buff *skb, pid_t sid)
+{
+	struct task_struct *g, *p;
+	struct file *file = skb->sk->sk_socket->file;
+	int i, found=0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		struct files_struct *files;
+		if (p->signal->session != sid)
+			continue;
+
+		task_lock(p);
+		files = p->files;
+		if (files) {
+			spin_lock(&files->file_lock);
+			for (i=0; i < files->max_fds; i++) {
+				if (fcheck_files(files, i) == file) {
+					found = 1;
+					break;
+				}
+			}
+			spin_unlock(&files->file_lock);
+		}
+		task_unlock(p);
+		if (found)
+			goto out;	/* goto, not break: the iteration macros form a nested loop */
+	} while_each_thread(g, p);
+out:
+	read_unlock(&tasklist_lock);
+
+	return found;
+}
+/* Match callback.  Packets without a socket, or whose socket has no
+ * file, never match.  Otherwise every criterion selected in info->match
+ * is tested in turn and must hold (its sense flipped when the same bit is
+ * set in info->invert); the first failing criterion returns 0.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_owner_info *info = matchinfo;
+
+	if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
+		return 0;
+
+	if(info->match & IPT_OWNER_UID) {
+		if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
+		    !!(info->invert & IPT_OWNER_UID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_GID) {
+		if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
+		    !!(info->invert & IPT_OWNER_GID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_PID) {
+		if (!match_pid(skb, info->pid) ^
+		    !!(info->invert & IPT_OWNER_PID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_SID) {
+		if (!match_sid(skb, info->sid) ^
+		    !!(info->invert & IPT_OWNER_SID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_COMM) {
+		if (!match_comm(skb, info->comm) ^
+		    !!(info->invert & IPT_OWNER_COMM))
+			return 0;
+	}
+
+	return 1;
+}
+/* Rule validation.  Returns 0 (after logging) when the rule is attached
+ * to a hook other than LOCAL_OUT/POST_ROUTING or matchsize is wrong; on
+ * CONFIG_SMP builds it also refuses pid, sid and command matching.
+ * Returns 1 otherwise.
+ */
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+        if (hook_mask
+            & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
+                printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
+                return 0;
+        }
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) {
+		printk("Matchsize %u != %Zu\n", matchsize,
+		       IPT_ALIGN(sizeof(struct ipt_owner_info)));
+		return 0;
+	}
+#ifdef CONFIG_SMP
+	/* files->file_lock can not be used in a BH */
+	if (((struct ipt_owner_info *)matchinfo)->match
+	    & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+		printk("ipt_owner: pid, sid and command matching is broken "
+		       "on SMP.\n");
+		return 0;
+	}
+#endif
+	return 1;
+}
+
+static struct ipt_match owner_match = {
+	.name		= "owner",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&owner_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&owner_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
new file mode 100644
index 000000000000..1a53924041fc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -0,0 +1,134 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ipt_physdev.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_bridge.h>
+#define MATCH   1
+#define NOMATCH 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("iptables bridge physical device match module");
+/* Match callback.  Returns MATCH or NOMATCH.  Each option selected in
+ * info->bitmask is tested against skb->nf_bridge; the matching bit in
+ * info->invert flips the sense of that one option.  Device names are
+ * compared one unsigned int at a time under the rule's in_mask/out_mask.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	int i;
+	static const char nulldevname[IFNAMSIZ];	/* all-zero name used when a device is absent */
+	const struct ipt_physdev_info *info = matchinfo;
+	unsigned int ret;
+	const char *indev, *outdev;
+	struct nf_bridge_info *nf_bridge;
+
+	/* Not a bridged IP packet or no info available yet:
+	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+	 * the destination device will be a bridge. */
+	if (!(nf_bridge = skb->nf_bridge)) {
+		/* Return MATCH if the invert flags of the used options are on */
+		if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+		    !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
+		    !(info->invert & IPT_PHYSDEV_OP_ISIN))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
+		    !(info->invert & IPT_PHYSDEV_OP_ISOUT))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
+		    !(info->invert & IPT_PHYSDEV_OP_IN))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
+		    !(info->invert & IPT_PHYSDEV_OP_OUT))
+			return NOMATCH;
+		return MATCH;
+	}
+
+	/* This only makes sense in the FORWARD and POSTROUTING chains */
+	if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
+	    !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
+		return NOMATCH;
+
+	if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
+	    (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
+	    (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
+	    (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
+		return NOMATCH;
+
+	if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
+		goto match_outdev;
+	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		/* ret collects every masked bit in which the two names differ */
+		ret |= (((const unsigned int *)indev)[i]
+			^ ((const unsigned int *)info->physindev)[i])
+			& ((const unsigned int *)info->in_mask)[i];
+	}
+
+	if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
+		return NOMATCH;
+
+match_outdev:
+	if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
+		return MATCH;
+	outdev = nf_bridge->physoutdev ?
+		 nf_bridge->physoutdev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		ret |= (((const unsigned int *)outdev)[i]
+			^ ((const unsigned int *)info->physoutdev)[i])
+			& ((const unsigned int *)info->out_mask)[i];
+	}
+
+	return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
+}
+/* Rule validation.  Returns 1 only when matchsize is the aligned size of
+ * struct ipt_physdev_info and info->bitmask selects at least one option
+ * and nothing outside IPT_PHYSDEV_OP_MASK; returns 0 otherwise.
+ */
+static int
+checkentry(const char *tablename,
+		       const struct ipt_ip *ip,
+		       void *matchinfo,
+		       unsigned int matchsize,
+		       unsigned int hook_mask)
+{
+	const struct ipt_physdev_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
+		return 0;
+	if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
+	    info->bitmask & ~IPT_PHYSDEV_OP_MASK)
+		return 0;
+	return 1;
+}
+
+static struct ipt_match physdev_match = {
+	.name		= "physdev",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&physdev_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&physdev_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
new file mode 100644
index 000000000000..8ddb1dc5e5ae
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_pkttype.c
@@ -0,0 +1,70 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+#include <linux/netfilter_ipv4/ipt_pkttype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
+
+/* Match callback: compare the packet's pkt_type with the one stored in
+ * the rule and flip the outcome when the rule is inverted.
+ */
+static int match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_pkttype_info *cfg = matchinfo;
+	int same_type = (skb->pkt_type == cfg->pkttype);
+
+	return same_type ^ cfg->invert;
+}
+
+/* Rule validation: only the size of the match data is checked.  The
+ * hook restriction below is commented out, so the rule is accepted on
+ * every hook.
+ */
+static int checkentry(const char *tablename,
+		   const struct ipt_ip *ip,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+/*
+	if (hook_mask
+	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+		| (1 << NF_IP_FORWARD))) {
+		printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+*/
+	return matchsize == IPT_ALIGN(sizeof(struct ipt_pkttype_info));
+}
+
+static struct ipt_match pkttype_match = {
+	.name		= "pkttype",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+/* Module load: register the "pkttype" match and hand back the result. */
+static int __init init(void)
+{
+	int err = ipt_register_match(&pkttype_match);
+
+	return err;
+}
+
+/* Module unload: drop the registration made in init(). */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&pkttype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
new file mode 100644
index 000000000000..54a6897ebaa6
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_realm.c
@@ -0,0 +1,76 @@
+/* IP tables module for matching the routing realm
+ *
+ * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_realm.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables realm match");
+
+/* Match callback: compare the rule's id with the masked tclassid of the
+ * packet's routing entry (skb->dst, dereferenced unconditionally) and
+ * flip the outcome when the rule is inverted.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_realm_info *cfg = matchinfo;
+	int same_realm = (cfg->id == (skb->dst->tclassid & cfg->mask));
+
+	return same_realm ^ cfg->invert;
+}
+
+/* Rule validation: returns 1 when the rule is attached only to the
+ * POST_ROUTING, FORWARD, LOCAL_OUT or LOCAL_IN hooks and the match data
+ * has the expected size; logs the reason and returns 0 otherwise.
+ */
+static int check(const char *tablename,
+                 const struct ipt_ip *ip,
+                 void *matchinfo,
+                 unsigned int matchsize,
+                 unsigned int hook_mask)
+{
+	const unsigned int allowed_hooks =
+		(1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
+		(1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN);
+
+	if (hook_mask & ~allowed_hooks) {
+		printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
+		       "LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+
+	if (matchsize == IPT_ALIGN(sizeof(struct ipt_realm_info)))
+		return 1;
+
+	printk("ipt_realm: invalid matchsize.\n");
+	return 0;
+}
+
+static struct ipt_match realm_match = {
+	.name		= "realm",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE
+};
+
+/* Module load: register the "realm" match and hand back the result. */
+static int __init init(void)
+{
+	int err = ipt_register_match(&realm_match);
+
+	return err;
+}
+
+/* Module unload: drop the registration made in init(). */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&realm_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
new file mode 100644
index 000000000000..25ab9fabdcba
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -0,0 +1,1002 @@
+/* Kernel module to check if the source address has been seen recently.
*/
+/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
+/* Author: Stephen Frost <sfrost@snowman.net> */
+/* Project Page: http://snowman.net/projects/ipt_recent/ */
+/* This software is distributed under the terms of the GPL, Version 2 */
+/* This copyright does not cover user programs that use kernel services
+ * by normal system calls. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_recent.h>
+
+#undef DEBUG
+#define HASH_LOG 9	/* shift step used by hash_func() when folding an address */
+
+/* Defaults, these can be overridden on the module command-line. */
+static int ip_list_tot = 100;
+static int ip_pkt_list_tot = 20;
+static int ip_list_hash_size = 0;
+static int ip_list_perms = 0644;
+#ifdef DEBUG
+static int debug = 1;
+#endif
+
+static char version[] =
+KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n";
+
+MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
+MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
+MODULE_LICENSE("GPL");
+module_param(ip_list_tot, int, 0400);
+module_param(ip_pkt_list_tot, int, 0400);
+module_param(ip_list_hash_size, int, 0400);
+module_param(ip_list_perms, int, 0400);
+#ifdef DEBUG
+module_param(debug, int, 0600);
+MODULE_PARM_DESC(debug,"debugging level, defaults to 1");
+#endif
+MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
+MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
+
+/* Structure of our list of recently seen addresses.  One element per
+ * remembered address; a table holds ip_list_tot of them. */
+struct recent_ip_list {
+	u_int32_t addr;			/* 0 marks an unused slot (the /proc listing skips it) */
+	u_int8_t  ttl;			/* 0 for entries added through /proc */
+	unsigned long last_seen;	/* printed as "last_seen"; presumably jiffies - set outside this view */
+	unsigned long *last_pkts;	/* timestamps; readers walk at most ip_pkt_list_tot of them */
+	u_int32_t oldest_pkt;		/* presumably the index in last_pkts to reuse next - TODO confirm */
+	u_int32_t hash_entry;		/* slot of hash_table that refers to this element */
+	u_int32_t time_pos;		/* index of this element's record in time_info */
+};
+
+struct time_info_list {
+	u_int32_t position;		/* index into the table array */
+	u_int32_t time;
+};
+
+/* Structure of our linked list of tables of recent lists. */
+struct recent_ip_tables {
+	char name[IPT_RECENT_NAME_LEN];
+	int count;
+	int time_pos;
+	struct recent_ip_list *table;
+	struct recent_ip_tables *next;
+	spinlock_t list_lock;		/* taken around every access to this table's arrays */
+	int *hash_table;		/* hash slot -> index into table, -1 when the slot is empty */
+	struct time_info_list *time_info;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *status_proc;
+#endif /* CONFIG_PROC_FS */
+};
+
+/* Our current list of addresses we have recently seen.
+ * Only added to on a --set, and only updated on --set || --update
+ */
+static struct recent_ip_tables *r_tables = NULL;
+
+/* We protect r_list with this spinlock so two processors are not modifying
+ * the list at the same time.
+ * (match() holds it while walking the r_tables list; the contents of one
+ * table are covered by that table's own list_lock.)
+ */
+static DEFINE_SPINLOCK(recent_lock);
+
+#ifdef CONFIG_PROC_FS
+/* Our /proc/net/ipt_recent entry */
+static struct proc_dir_entry *proc_net_ipt_recent = NULL;
+#endif
+
+/* Function declaration for later. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop);
+
+/* Function to hash a given address into the hash table of table_size size.
+ * The address is XOR-folded onto itself in steps of HASH_LOG bits and the
+ * result is masked with table_size - 1, so every slot is reachable only
+ * when table_size is a power of two. */
+static int hash_func(unsigned int addr, int table_size)
+{
+	int result = 0;
+	unsigned int value = addr;
+	do { result ^= value; } while((value >>= HASH_LOG));
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
+			 result & (table_size - 1),
+			 addr,
+			 table_size);
+#endif
+
+	return(result & (table_size - 1));
+}
+
+#ifdef CONFIG_PROC_FS
+/* This is the function which produces the output for our /proc output
+ * interface which lists each IP address, the last seen time and the
+ * other recent times the address was seen.
+ */
+/* `data' is the struct recent_ip_tables to dump.  One text record is
+ * written per used slot; records that end before `offset' are discarded
+ * and output stops at the first record that runs past offset + length.
+ * Returns the number of bytes made available at *start.  `eof' is never
+ * written.
+ * NOTE(review): the sprintf() calls are not bounded by the size of
+ * `buffer'; a record is only dropped after it has been written.
+ */
+static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+{
+	int len = 0, count, last_len = 0, pkt_count;
+	off_t pos = 0;
+	off_t begin = 0;
+	struct recent_ip_tables *curr_table;
+
+	curr_table = (struct recent_ip_tables*) data;
+
+	spin_lock_bh(&curr_table->list_lock);
+	for(count = 0; count < ip_list_tot; count++) {
+		if(!curr_table->table[count].addr) continue;	/* unused slot */
+		last_len = len;	/* length before this record, restored if the record does not fit */
+		len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
+		len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
+		len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
+		len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
+		len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
+		for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
+			if(!curr_table->table[count].last_pkts[pkt_count]) break;	/* a zero timestamp ends the list */
+			len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
+		}
+		len += sprintf(buffer+len,"\n");
+		pos = begin + len;
+		if(pos < offset) { len = 0; begin = pos; }	/* everything so far lies before the requested offset */
+		if(pos > offset + length) { len = last_len; break; }	/* record overruns the window: drop it and stop */
+	}
+
+	*start = buffer + (offset - begin);
+	len -= (offset - begin);
+	if(len > length) len = length;
+
+	spin_unlock_bh(&curr_table->list_lock);
+	return len;
+}
+
+/* ip_recent_ctrl provides an interface for users to modify the table
+ * directly.  This allows adding entries, removing entries, and
+ * flushing the entire table.
+ * This is done by opening up the appropriate table for writing and
+ * sending one of:
+ *   xx.xx.xx.xx   -- Add entry to table with current time
+ *  +xx.xx.xx.xx   -- Add entry to table with current time
+ *  -xx.xx.xx.xx   -- Remove entry from table
+ *  clear          -- Flush table, remove all entries
+ *
+ * `data' is the struct recent_ip_tables the /proc file belongs to.  Only
+ * the first sizeof(buffer) - 1 bytes of a write are examined.
+ * Returns the number of input bytes consumed (which may be less than
+ * `size' when parsing stops early), -EFAULT if the user buffer cannot be
+ * read, or -ENOMEM if the temporary match arguments cannot be allocated.
+ */
+
+static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
+{
+	static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
+	u_int32_t val;
+	int base, used = 0;
+	char c, *cp;
+	union iaddr {
+		uint8_t bytes[4];
+		uint32_t word;
+	} res;
+	uint8_t *pp = res.bytes;
+	int digit;
+
+	char buffer[20];
+	int len, check_set = 0, count;
+	u_int32_t addr = 0;
+	struct sk_buff *skb;
+	struct ipt_recent_info *info;
+	struct recent_ip_tables *curr_table;
+
+	curr_table = (struct recent_ip_tables*) data;
+
+	/* Always leave room for the terminator: the parser below walks the
+	 * buffer until it meets a character it does not accept, so an
+	 * unterminated buffer would let it run off the end of the array. */
+	if(size >= sizeof(buffer)) len = sizeof(buffer) - 1; else len = size;
+
+	if(copy_from_user(buffer,input,len)) return -EFAULT;
+
+	buffer[len] = '\0';
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
+#endif
+
+	cp = buffer;
+	while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
+
+	/* Check if we are asked to flush the entire table.  strncmp() stops
+	 * at the terminator, so bytes of buffer that were never written are
+	 * not read when the input is shorter than the keyword. */
+	if(!strncmp(cp,"clear",5)) {
+		used += 5;
+		spin_lock_bh(&curr_table->list_lock);
+		curr_table->time_pos = 0;
+		for(count = 0; count < ip_list_hash_size; count++) {
+			curr_table->hash_table[count] = -1;
+		}
+		for(count = 0; count < ip_list_tot; count++) {
+			curr_table->table[count].last_seen = 0;
+			curr_table->table[count].addr = 0;
+			curr_table->table[count].ttl = 0;
+			/* NOTE(review): last_pkts is declared unsigned long *, yet
+			 * only ip_pkt_list_tot * sizeof(u_int32_t) bytes are
+			 * cleared, which is half of the array where long is 64
+			 * bits.  Left unchanged because the allocation of
+			 * last_pkts is not visible from here - confirm its size
+			 * and fix both together. */
+			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+			curr_table->table[count].oldest_pkt = 0;
+			curr_table->table[count].time_pos = 0;
+			curr_table->time_info[count].position = count;
+			curr_table->time_info[count].time = 0;
+		}
+		spin_unlock_bh(&curr_table->list_lock);
+		return used;
+	}
+
+	check_set = IPT_RECENT_SET;
+	switch(*cp) {
+		case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
+		case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
+		default: if(!isdigit(*cp)) return (used+1); break;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
+#endif
+	/* Get addr (effectively inet_aton()) */
+	/* Shamelessly stolen from libc, a function in the kernel for doing
+	 * this would, of course, be greatly preferred, but our options appear
+	 * to be rather limited, so we will just do it ourselves here.
+	 */
+	res.word = 0;
+
+	c = *cp;
+	for(;;) {
+		if(!isdigit(c)) return used;
+		val = 0; base = 10; digit = 0;
+		if(c == '0') {
+			c = *++cp;
+			if(c == 'x' || c == 'X') base = 16, c = *++cp;
+			else { base = 8; digit = 1; }
+		}
+		for(;;) {
+			if(isascii(c) && isdigit(c)) {
+				/* '8' and '9' are not octal digits; '0' is. */
+				if(base == 8 && (c == '8' || c == '9')) return used;
+				val = (val * base) + (c - '0');
+				c = *++cp;
+				digit = 1;
+			} else if(base == 16 && isascii(c) && isxdigit(c)) {
+				val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
+				c = *++cp;
+				digit = 1;
+			} else break;
+		}
+		if(c == '.') {
+			/* at most three leading parts, each a single byte */
+			if(pp > res.bytes + 2 || val > 0xff) return used;
+			*pp++ = val;
+			c = *++cp;
+		} else break;
+	}
+	used = cp - buffer;
+	if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
+	if(c == '\n') used++;
+	if(!digit) return used;
+
+	/* the last part fills whatever bytes the dotted parts left over */
+	if(val > max[pp - res.bytes]) return used;
+	addr = res.word | htonl(val);
+
+	if(!addr && check_set == IPT_RECENT_SET) return used;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
+#endif
+
+	/* Set up and just call match */
+	info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
+	if(!info) { return -ENOMEM; }
+	info->seconds = 0;
+	info->hit_count = 0;
+	info->check_set = check_set;
+	info->invert = 0;
+	info->side = IPT_RECENT_SOURCE;
+	strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
+	info->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+	/* Only the fields of the fake packet that are assigned below are
+	 * initialised; the rest of the sk_buff is left as allocated. */
+	skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
+	if (!skb) {
+		used = -ENOMEM;
+		goto out_free_info;
+	}
+	skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
+	if (!skb->nh.iph) {
+		used = -ENOMEM;
+		goto out_free_skb;
+	}
+
+	skb->nh.iph->saddr = addr;
+	skb->nh.iph->daddr = 0;
+	/* Clear ttl since we have no way of knowing it */
+	skb->nh.iph->ttl = 0;
+	match(skb,NULL,NULL,info,0,NULL);
+
+	kfree(skb->nh.iph);
+out_free_skb:
+	kfree(skb);
+out_free_info:
+	kfree(info);
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
+#endif
+	return used;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/* 'match' is our primary function, called by the kernel whenever a rule is
+ * hit with our module as an option to it.
+ * What this function does depends on what was specifically asked of it by
+ * the user:
+ * --set -- Add or update last seen time of the source address of the packet
+ *   -- matchinfo->check_set == IPT_RECENT_SET
+ * --rcheck -- Just check if the source address is in the list
+ *   -- matchinfo->check_set == IPT_RECENT_CHECK
+ * --update -- If the source address is in the list, update last_seen
+ *   -- matchinfo->check_set == IPT_RECENT_UPDATE
+ * --remove -- If the source address is in the list, remove it
+ *   -- matchinfo->check_set == IPT_RECENT_REMOVE
+ * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
+ *   -- matchinfo->seconds
+ * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
+ *   -- matchinfo->hit_count
+ * --seconds and --hitcount can be combined
+ *
+ * The return value starts out as info->invert ("no match") and is flipped to
+ * !info->invert when the requested operation matches.  The table is looked up
+ * by name under recent_lock; all work on the table itself is done under the
+ * table's own list_lock.
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	int pkt_count, hits_found, ans;
+	unsigned long now;
+	const struct ipt_recent_info *info = matchinfo;
+	u_int32_t addr = 0, time_temp;
+	u_int8_t ttl = skb->nh.iph->ttl;
+	int *hash_table;
+	int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
+	struct time_info_list *time_info;
+	struct recent_ip_tables *curr_table;
+	struct recent_ip_tables *last_table;
+	struct recent_ip_list *r_list;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
+#endif
+
+	/* Default is false ^ info->invert */
+	ans = info->invert;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
+#endif
+
+	/* if out != NULL then routing has been done and TTL changed.
+	 * We change it back here internally for match what came in before routing. */
+	if(out) ttl++;
+
+	/* Find the right table */
+	spin_lock_bh(&recent_lock);
+	curr_table = r_tables;
+	while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
+#endif
+
+	spin_unlock_bh(&recent_lock);
+
+	/* Table with this name not found, match impossible */
+	if(!curr_table) { return ans; }
+
+	/* Make sure no one is changing the list while we work with it */
+	spin_lock_bh(&curr_table->list_lock);
+
+	r_list = curr_table->table;
+	if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
+
+	if(!addr) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
+#endif
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
+#endif
+
+	/* Get jiffies now in case they changed while we were waiting for a lock */
+	now = jiffies;
+	hash_table = curr_table->hash_table;
+	time_info = curr_table->time_info;
+
+	orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
+	/* Hash entry at this result used */
+	/* Check for TTL match if requested.  If TTL is zero then a match would never
+	 * happen, so match regardless of existing TTL in that case.  Zero means the
+	 * entry was added via the /proc interface anyway, so we will just use the
+	 * first TTL we get for that IP address. */
+	if(info->check_set & IPT_RECENT_TTL) {
+		while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
+			(!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
+			/* Collision in hash table */
+			hash_result = (hash_result + 1) % ip_list_hash_size;
+		}
+	} else {
+		while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
+			/* Collision in hash table */
+			hash_result = (hash_result + 1) % ip_list_hash_size;
+		}
+	}
+
+	if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
+		/* IP not in list and not asked to SET */
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+	/* Check if we need to handle the collision, do not need to on REMOVE */
+	if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
+				 orig_hash_result,
+				 hash_result,
+				 r_list[hash_table[orig_hash_result]].addr,
+				 addr);
+#endif
+
+		/* We had a collision.
+		 * orig_hash_result is where we started, hash_result is where we ended up.
+		 * So, swap them because we are likely to see the same guy again sooner */
+#ifdef DEBUG
+		if(debug) {
+			printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
+			printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
+				r_list[hash_table[orig_hash_result]].hash_entry);
+		}
+#endif
+
+		r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
+
+
+		temp = hash_table[orig_hash_result];
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
+#endif
+		hash_table[orig_hash_result] = hash_table[hash_result];
+		hash_table[hash_result] = temp;
+		temp = hash_result;
+		hash_result = orig_hash_result;
+		orig_hash_result = temp;
+		time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
+		if(hash_table[hash_result] != -1) {
+			r_list[hash_table[hash_result]].hash_entry = hash_result;
+			time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+		}
+
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
+#endif
+	}
+
+	if(hash_table[hash_result] == -1) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
+				 hash_result, addr);
+#endif
+
+		/* New item found and IPT_RECENT_SET, so we need to add it */
+		location = time_info[curr_table->time_pos].position;
+		hash_table[r_list[location].hash_entry] = -1;
+		hash_table[hash_result] = location;
+		/* Size the clear by the element type of last_pkts so the whole
+		 * per-entry array is zeroed whatever its width. */
+		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(*r_list[location].last_pkts));
+		r_list[location].time_pos = curr_table->time_pos;
+		r_list[location].addr = addr;
+		r_list[location].ttl = ttl;
+		r_list[location].last_seen = now;
+		r_list[location].oldest_pkt = 1;
+		r_list[location].last_pkts[0] = now;
+		r_list[location].hash_entry = hash_result;
+		time_info[curr_table->time_pos].time = r_list[location].last_seen;
+		curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
+
+		ans = !info->invert;
+	} else {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
+				 hash_result,
+				 addr);
+#endif
+
+		/* Existing item found */
+		location = hash_table[hash_result];
+		/* We have a match on address, now to make sure it meets all requirements for a
+		 * full match. */
+		if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
+			if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
+			if(info->seconds && !info->hit_count) {
+				if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
+			}
+			if(info->seconds && info->hit_count) {
+				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+					if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
+				}
+				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+			}
+			if(info->hit_count && !info->seconds) {
+				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+					if(r_list[location].last_pkts[pkt_count] == 0) break;
+					hits_found++;
+				}
+				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+			}
+		}
+#ifdef DEBUG
+		if(debug) {
+			if(ans)
+				printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
+			else
+				printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
+		}
+#endif
+
+		/* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
+		 * current timestamp to the last_seen.
+		 * Note the deliberate assignment: a SET on an existing entry always
+		 * counts as a match. */
+		if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
+#endif
+			/* Have to update our time info */
+			time_loc = r_list[location].time_pos;
+			time_info[time_loc].time = now;
+			time_info[time_loc].position = location;
+			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+				time_temp = time_info[time_loc].time;
+				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+				time_temp = time_info[time_loc].position;
+				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+				r_list[time_info[time_loc].position].time_pos = time_loc;
+				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+				time_loc = (time_loc+1) % ip_list_tot;
+			}
+			r_list[location].time_pos = time_loc;
+			r_list[location].ttl = ttl;
+			r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
+			/* Advance the ring index without modifying oldest_pkt twice in
+			 * one expression (x = ++x % n is undefined behaviour in C). */
+			r_list[location].oldest_pkt = (r_list[location].oldest_pkt + 1) % ip_pkt_list_tot;
+			r_list[location].last_seen = now;
+		}
+		/* If we have been asked to remove the entry from the list, just set it to 0 */
+		if(info->check_set & IPT_RECENT_REMOVE) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
+#endif
+			/* Check if this is part of a collision chain */
+			while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
+				/* Wrap the probe index: the loop condition already wraps, so
+				 * the index used below must wrap too or it can run past the
+				 * end of hash_table. */
+				orig_hash_result = (orig_hash_result + 1) % ip_list_hash_size;
+				if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
+					/* Found collision chain, how deep does this rabbit hole go? */
+#ifdef DEBUG
+					if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
+#endif
+					end_collision_chain = orig_hash_result;
+				}
+			}
+			if(end_collision_chain != -1) {
+#ifdef DEBUG
+				if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
+#endif
+				/* Part of a collision chain, swap it with the end of the chain
+				 * before removing. */
+				r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
+				temp = hash_table[end_collision_chain];
+				hash_table[end_collision_chain] = hash_table[hash_result];
+				hash_table[hash_result] = temp;
+				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+				hash_result = end_collision_chain;
+				r_list[hash_table[hash_result]].hash_entry = hash_result;
+				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+			}
+			location = hash_table[hash_result];
+			hash_table[r_list[location].hash_entry] = -1;
+			time_loc = r_list[location].time_pos;
+			time_info[time_loc].time = 0;
+			time_info[time_loc].position = location;
+			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+				time_temp = time_info[time_loc].time;
+				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+				time_temp = time_info[time_loc].position;
+				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+				r_list[time_info[time_loc].position].time_pos = time_loc;
+				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+				time_loc = (time_loc+1) % ip_list_tot;
+			}
+			r_list[location].time_pos = time_loc;
+			r_list[location].last_seen = 0;
+			r_list[location].addr = 0;
+			r_list[location].ttl = 0;
+			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(*r_list[location].last_pkts));
+			r_list[location].oldest_pkt = 0;
+			ans = !info->invert;
+		}
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+	spin_unlock_bh(&curr_table->list_lock);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
+#endif
+	return ans;
+}
+
+/* This function is to verify that the rule given during the userspace iptables
+ * command is correct.
+ * If the command is valid then we check if the table name referred to by the
+ * rule exists, if not it is created.
+ *
+ * Returns 1 if the rule is accepted (the named table then exists and its use
+ * count has been incremented), 0 if the rule is invalid or an allocation
+ * failed.  All allocations are done without recent_lock held, so the list is
+ * searched a second time before the new table is linked in.
+ */
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	int flag = 0, c;
+	unsigned long *hold;
+	const struct ipt_recent_info *info = matchinfo;
+	struct recent_ip_tables *curr_table, *find_table, *last_table;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
+#endif
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0;
+
+	/* seconds and hit_count only valid for CHECK/UPDATE */
+	if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
+	if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
+	if(info->check_set & IPT_RECENT_CHECK) flag++;
+	if(info->check_set & IPT_RECENT_UPDATE) flag++;
+
+	/* One and only one of these should ever be set */
+	if(flag != 1) return 0;
+
+	/* Name must be set to something */
+	if(!info->name || !info->name[0]) return 0;
+
+	/* Things look good, create a list for this if it does not exist */
+	/* Lock the linked list while we play with it */
+	spin_lock_bh(&recent_lock);
+
+	/* Look for an entry with this name already created */
+	/* Finds the end of the list and the entry before the end if current name does not exist */
+	find_table = r_tables;
+	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+	/* If a table already exists just increment the count on that table and return */
+	if(find_table) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
+#endif
+		find_table->count++;
+		spin_unlock_bh(&recent_lock);
+		return 1;
+	}
+
+	spin_unlock_bh(&recent_lock);
+
+	/* Table with this name not found */
+	/* Allocate memory for new linked list item */
+
+#ifdef DEBUG
+	if(debug) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
+		printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables));
+	}
+#endif
+
+	curr_table = vmalloc(sizeof(struct recent_ip_tables));
+	if(curr_table == NULL) return 0;
+
+	spin_lock_init(&curr_table->list_lock);
+	curr_table->next = NULL;
+	curr_table->count = 1;
+	curr_table->time_pos = 0;
+	strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
+	curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+	/* Allocate memory for this table and the list of packets in each entry. */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
+			sizeof(struct recent_ip_list)*ip_list_tot,
+			info->name);
+#endif
+
+	curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
+	if(curr_table->table == NULL) { vfree(curr_table); return 0; }
+	memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
+			sizeof(*hold)*ip_pkt_list_tot*ip_list_tot);
+#endif
+
+	/* One shared allocation holds the per-entry packet time arrays.  It is
+	 * sliced below with pointer arithmetic on 'hold', so it must be sized in
+	 * units of *hold; sizeof(u_int32_t) would under-allocate where
+	 * unsigned long is wider than 32 bits. */
+	hold = vmalloc(sizeof(*hold)*ip_pkt_list_tot*ip_list_tot);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
+#endif
+	if(hold == NULL) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 0;
+	}
+	for(c = 0; c < ip_list_tot; c++) {
+		curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
+	}
+
+	/* Allocate memory for the hash table */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
+			sizeof(int)*ip_list_hash_size);
+#endif
+
+	curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
+	if(!curr_table->hash_table) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 0;
+	}
+
+	/* -1 marks an unused hash slot */
+	for(c = 0; c < ip_list_hash_size; c++) {
+		curr_table->hash_table[c] = -1;
+	}
+
+	/* Allocate memory for the time info */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
+			sizeof(struct time_info_list)*ip_list_tot);
+#endif
+
+	curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
+	if(!curr_table->time_info) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 0;
+	}
+	for(c = 0; c < ip_list_tot; c++) {
+		curr_table->time_info[c].position = c;
+		curr_table->time_info[c].time = 0;
+	}
+
+	/* Put the new table in place */
+	spin_lock_bh(&recent_lock);
+	find_table = r_tables;
+	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+	/* If a table already exists just increment the count on that table and return */
+	if(find_table) {
+		find_table->count++;
+		spin_unlock_bh(&recent_lock);
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
+#endif
+		vfree(curr_table->time_info);
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 1;
+	}
+	if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
+
+	spin_unlock_bh(&recent_lock);
+
+#ifdef CONFIG_PROC_FS
+	/* Create our proc 'status' entry. */
+	curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
+	if (!curr_table->status_proc) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
+		/* Destroy the created table */
+		spin_lock_bh(&recent_lock);
+		last_table = NULL;
+		curr_table = r_tables;
+		if(!curr_table) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
+#endif
+			spin_unlock_bh(&recent_lock);
+			return 0;
+		}
+		while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+		if(!curr_table) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
+#endif
+			spin_unlock_bh(&recent_lock);
+			return 0;
+		}
+		if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+		spin_unlock_bh(&recent_lock);
+		vfree(curr_table->time_info);
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 0;
+	}
+
+	curr_table->status_proc->owner = THIS_MODULE;
+	curr_table->status_proc->data = curr_table;
+	/* Make ->data visible before the handlers that use it are published */
+	wmb();
+	curr_table->status_proc->read_proc = ip_recent_get_info;
+	curr_table->status_proc->write_proc = ip_recent_ctrl;
+#endif /* CONFIG_PROC_FS */
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
+#endif
+
+	return 1;
+}
+
+/* This function is called in the event that a rule matching this module is
+ * removed.
+ * When this happens we need to check if there are no other rules matching
+ * the table given.  If that is the case then we remove the table and clean
+ * up its memory.
+ */
+static void
+destroy(void *matchinfo, unsigned int matchsize)
+{
+	const struct ipt_recent_info *info = matchinfo;
+	struct recent_ip_tables *curr_table, *last_table;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
+#endif
+
+	if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
+
+	/* Lock the linked list while we play with it */
+	spin_lock_bh(&recent_lock);
+
+	/* Look for an entry with this name already created */
+	/* Finds the end of the list and the entry before the end if current name does not exist */
+	last_table = NULL;
+	curr_table = r_tables;
+	if(!curr_table) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+	while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+
+	/* If a table does not exist then do nothing and return */
+	if(!curr_table) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+
+	curr_table->count--;
+
+	/* If count is still non-zero then there are still rules referenceing it so we do nothing */
+	if(curr_table->count) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
+#endif
+
+	/* Count must be zero so we remove this table from the list */
+	if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+
+	spin_unlock_bh(&recent_lock);
+
+	/* lock to make sure any late-runners still using this after we removed it from
+	 * the list finish up then remove everything */
+	spin_lock_bh(&curr_table->list_lock);
+	spin_unlock_bh(&curr_table->list_lock);
+
+#ifdef CONFIG_PROC_FS
+	if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
+#endif /* CONFIG_PROC_FS */
+	/* table[0].last_pkts is the base of the single shared packet-time allocation */
+	vfree(curr_table->table[0].last_pkts);
+	vfree(curr_table->table);
+	vfree(curr_table->hash_table);
+	vfree(curr_table->time_info);
+	vfree(curr_table);
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
+#endif
+
+	return;
+}
+
+/* This is the structure we pass to ipt_register to register our
+ * module with iptables.
+ */
+static struct ipt_match recent_match = {
+	.name		= "recent",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.destroy	= &destroy,
+	.me		= THIS_MODULE
+};
+
+/* Kernel module initialization. */
+static int __init init(void)
+{
+	int err, count;
+
+	printk(version);
+#ifdef CONFIG_PROC_FS
+	proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
+	if(!proc_net_ipt_recent) return -ENOMEM;
+#endif
+
+	/* The hash table must be strictly larger than the entry list so that the
+	 * linear probing in match() always finds an empty (-1) slot. */
+	if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
+		printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
+		ip_list_hash_size = 0;
+	}
+
+	/* Default: the smallest power of two that is >= 3 * ip_list_tot (minimum 4) */
+	if(!ip_list_hash_size) {
+		ip_list_hash_size = ip_list_tot*3;
+		count = 2*2;
+		while(ip_list_hash_size > count) count = count*2;
+		ip_list_hash_size = count;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
+#endif
+
+	err = ipt_register_match(&recent_match);
+	if (err)
+		remove_proc_entry("ipt_recent", proc_net);
+	return err;
+}
+
+/* Kernel module destruction. */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&recent_match);
+
+	remove_proc_entry("ipt_recent",proc_net);
+}
+
+/* Register our module with the kernel.
*/ +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c new file mode 100644 index 000000000000..fe2b327bcaa4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_sctp.c @@ -0,0 +1,203 @@ +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/sctp.h> + +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netfilter_ipv4/ipt_sctp.h> + +#ifdef DEBUG_SCTP +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static int +match_flags(const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) { + if (flag_info[i].chunktype == chunktype) { + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + } + } + + return 1; +} + +static int +match_packet(const struct sk_buff *skb, + const u_int32_t *chunkmap, + int chunk_match_type, + const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + int *hotdrop) +{ + int offset; + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + sctp_chunkhdr_t _sch, *sch; + +#ifdef DEBUG_SCTP + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) { + SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); + } + + offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t); + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); + if (sch == NULL) { + duprintf("Dropping invalid SCTP packet.\n"); + *hotdrop = 1; + return 0; + } + + duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), sch->flags); + + offset += (htons(sch->length) + 3) & ~3; + + duprintf("skb->len: %d\toffset: %d\n", skb->len, offset); + + if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) { + switch (chunk_match_type) { 
+ case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 1; + } + break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); + } + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 0; + } + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return 0; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: + return SCTP_CHUNKMAP_IS_CLEAR(chunkmap); + case SCTP_CHUNK_MATCH_ANY: + return 0; + case SCTP_CHUNK_MATCH_ONLY: + return 1; + } + + /* This will never be reached, but required to stop compiler whine */ + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_sctp_info *info; + sctp_sctphdr_t _sh, *sh; + + info = (const struct ipt_sctp_info *)matchinfo; + + if (offset) { + duprintf("Dropping non-first fragment.. 
FIXME\n"); + return 0; + } + + sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh); + if (sh == NULL) { + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + + return SCCHECK(((ntohs(sh->source) >= info->spts[0]) + && (ntohs(sh->source) <= info->spts[1])), + IPT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) + && (ntohs(sh->dest) <= info->dpts[1])), + IPT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type, + info->flag_info, info->flag_count, + hotdrop), + IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_sctp_info *info; + + info = (const struct ipt_sctp_info *)matchinfo; + + return ip->proto == IPPROTO_SCTP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info)) + && !(info->flags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~info->flags) + && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || + (info->chunk_match_type & + (SCTP_CHUNK_MATCH_ALL + | SCTP_CHUNK_MATCH_ANY + | SCTP_CHUNK_MATCH_ONLY))); +} + +static struct ipt_match sctp_match = +{ + .list = { NULL, NULL}, + .name = "sctp", + .match = &match, + .checkentry = &checkentry, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&sctp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&sctp_match); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Match for SCTP protocol packets"); + diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c new file mode 
100644
index 000000000000..b1511b97ea5f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,74 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("iptables connection tracking state match module");
+
+/* Map the packet's conntrack status to a single state bit and report whether
+ * that bit is selected in the rule's statemask. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_state_info *rule = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	unsigned int bit;
+
+	if (skb->nfct == &ip_conntrack_untracked.ct_general) {
+		/* Packet was explicitly exempted from tracking */
+		bit = IPT_STATE_UNTRACKED;
+	} else {
+		/* No conntrack entry means the packet is INVALID */
+		bit = ip_conntrack_get(skb, &ctinfo) ? IPT_STATE_BIT(ctinfo)
+						     : IPT_STATE_INVALID;
+	}
+
+	return (rule->statemask & bit);
+}
+
+/* Accept the rule only when the match data has the expected size. */
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	return matchsize == IPT_ALIGN(sizeof(struct ipt_state_info));
+}
+
+static struct ipt_match state_match = {
+	.name		= "state",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	/* Pull in the connection tracking module before registering */
+	need_ip_conntrack();
+	return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c
b/net/ipv4/netfilter/ipt_tcpmss.c
new file mode 100644
index 000000000000..4dc9b16ab4a3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tcpmss.c
@@ -0,0 +1,127 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ipt_tcpmss.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define TH_SYN 0x02
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS match module");
+
+/* Returns 1 if the mss option is set and matched by the range, 0 otherwise
+ * (both XORed with 'invert').  A packet without an MSS option yields 'invert'.
+ * Sets *hotdrop and returns 0 when the TCP header or its options are
+ * truncated or the data offset is malformed. */
+static inline int
+mssoption_match(u_int16_t min, u_int16_t max,
+		const struct sk_buff *skb,
+		int invert,
+		int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	u8 _opt[15 * 4 - sizeof(_tcph)], *op;
+	unsigned int i, optlen;
+
+	/* If we don't have the whole header, drop packet. */
+	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		goto dropit;
+
+	/* Malformed. */
+	if (th->doff*4 < sizeof(*th))
+		goto dropit;
+
+	optlen = th->doff*4 - sizeof(*th);
+	if (!optlen)
+		goto out;
+
+	/* Truncated options. */
+	op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
+				optlen, _opt);
+	if (op == NULL)
+		goto dropit;
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == TCPOPT_MSS
+		    && (optlen - i) >= TCPOLEN_MSS
+		    && op[i+1] == TCPOLEN_MSS) {
+			u_int16_t mssval;
+
+			mssval = (op[i+2] << 8) | op[i+3];
+
+			return (mssval >= min && mssval <= max) ^ invert;
+		}
+		if (op[i] < 2) {
+			/* EOL and NOP are single-byte options */
+			i++;
+		} else {
+			/* The length byte must lie inside the option area;
+			 * an option kind in the very last byte has none, so
+			 * stop instead of reading past the validated data. */
+			if (i + 1 >= optlen)
+				break;
+			/* A zero length would never advance; step by one */
+			i += op[i+1]?:1;
+		}
+	}
+out:
+	return invert;
+
+ dropit:
+	*hotdrop = 1;
+	return 0;
+}
+
+/* iptables match hook: delegate to mssoption_match() with the rule's range */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_tcpmss_match_info *info = matchinfo;
+
+	return mssoption_match(info->mss_min, info->mss_max, skb,
+			       info->invert, hotdrop);
+}
+
+/* Accept the rule only for correctly sized match data on a non-inverted
+ * "-p tcp" rule. */
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
+		return 0;
+
+	/* Must specify -p tcp */
+	if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
+		printk("tcpmss: Only works on TCP packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match tcpmss_match = {
+	.name		= "tcpmss",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&tcpmss_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&tcpmss_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 000000000000..086a1bb61e3e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,64 @@
+/* Kernel module to match TOS values.
*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_tos.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables TOS match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_tos_info *info = matchinfo; + + return (skb->nh.iph->tos == info->tos) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info))) + return 0; + + return 1; +} + +static struct ipt_match tos_match = { + .name = "tos", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&tos_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tos_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c new file mode 100644 index 000000000000..219aa9de88cc --- /dev/null +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -0,0 +1,79 @@ +/* IP tables module for matching the value of the TTL + * + * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp + * + * (C) 2000,2001 by Harald Welte <laforge@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include <linux/module.h> +#include <linux/skbuff.h> + +#include <linux/netfilter_ipv4/ipt_ttl.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); +MODULE_DESCRIPTION("IP tables TTL matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_ttl_info *info = matchinfo; + + switch (info->mode) { + case IPT_TTL_EQ: + return (skb->nh.iph->ttl == info->ttl); + break; + case IPT_TTL_NE: + return (!(skb->nh.iph->ttl == info->ttl)); + break; + case IPT_TTL_LT: + return (skb->nh.iph->ttl < info->ttl); + break; + case IPT_TTL_GT: + return (skb->nh.iph->ttl > info->ttl); + break; + default: + printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", + info->mode); + return 0; + } + + return 0; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info))) + return 0; + + return 1; +} + +static struct ipt_match ttl_match = { + .name = "ttl", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ttl_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ttl_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c new file mode 100644 index 000000000000..260a4f0a2a90 --- /dev/null +++ b/net/ipv4/netfilter/iptable_filter.c @@ -0,0 +1,194 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. 
Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 
}, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_FILTER, + }, +}; + +/* Default to forward because I got too much mail already. 
*/ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ipt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c new file mode 100644 index 000000000000..160eb11b6e2f --- /dev/null +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -0,0 +1,260 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + * Extended to all five netfilter hooks by Brad Chapman & Harald Welte + */ +#include <linux/config.h> +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <net/route.h> +#include <linux/ip.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); +MODULE_DESCRIPTION("iptables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \ + (1 << NF_IP_LOCAL_IN) | \ + (1 << NF_IP_FORWARD) | \ + (1 << NF_IP_LOCAL_OUT) | \ + (1 << NF_IP_POST_ROUTING)) + +/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[5]; + struct ipt_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD 
*/ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ipt_local_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + u_int8_t tos; + u_int32_t saddr, daddr; + unsigned long nfmark; + + /* root is playing with raw sockets. 
*/ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + /* Save things which could affect route */ + nfmark = (*pskb)->nfmark; + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + tos = (*pskb)->nh.iph->tos; + + ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); + /* Reroute for ANY change. */ + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr +#ifdef CONFIG_IP_ROUTE_FWMARK + || (*pskb)->nfmark != nfmark +#endif + || (*pskb)->nh.iph->tos != tos)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + + return ret; +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_mangler, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ipt_ops[3]); + if (ret < 0) + 
goto cleanup_hook2; + + ret = nf_register_hook(&ipt_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: + nf_unregister_hook(&ipt_ops[3]); + cleanup_hook2: + nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_mangler); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c new file mode 100644 index 000000000000..01b4a3c814d3 --- /dev/null +++ b/net/ipv4/netfilter/iptable_raw.c @@ -0,0 +1,156 @@ +/* + * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . + * + * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> + */ +#include <linux/module.h> +#include <linux/netfilter_ipv4/ip_tables.h> + +#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[2]; + struct ipt_error term; +} initial_table __initdata = { + .repl = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), + .hook_entry = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + .underflow = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + 
.target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ipt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL); +} + +/* 'raw' is the very first table. 
*/ +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_RAW + }, + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_RAW + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_raw, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_raw); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_raw); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); |