Linux TCP/IP 协议栈之 Socket 的实现分析(一)

内核版本:2.6.37
参考[作者:kendo的文章(基于内核版本2.6.12)]

第一部份 Socket套接字的创建

socket 并不是 TCP/IP协议的一部份。
从广义上来讲,socket 是Unix/Linux 抽象的进程间通讯的一种方法。网络 socket 通讯仅仅是其若干协议中的一类。而tcp/ip 又是网络这类中的一种。
从tcp/ip 的角度看 socket ,它更多地体现了用户 API 与协议栈之间的一个中间接口层。用户通过调用socket API 将报文递交给协议栈,或者从协议栈中接收报文。

一、系统总入口
Linux 内核为所有的与socket 有关的操作的API,提供了一个统一的系统调用入口,其代码在net/socket.c 中:

/*
 *	System call vectors.
 *
 *	Argument checking cleaned up. Saved 20% in size.
 *	This function doesn't need to set the kernel lock because
 *	it is set by the callees.
 *
 *	socketcall() multiplexes the socket APIs onto one system call.
 *	@call selects the operation (SYS_SOCKET .. SYS_RECVMMSG) and @args
 *	points to a user-space array of unsigned long arguments.  nargs[call]
 *	bytes of that array are copied into a[] and handed to the matching
 *	sys_*() handler.
 *
 *	Returns the handler's result, -EINVAL for an out-of-range @call or an
 *	argument list larger than a[], or -EFAULT if the copy from user space
 *	fails.
 */

SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
	unsigned long a[6];	/* the widest handlers take six arguments */
	unsigned long a0, a1;
	int err;
	unsigned int len;

	if (call < 1 || call > SYS_RECVMMSG)
		return -EINVAL;

	len = nargs[call];
	if (len > sizeof(a))
		return -EINVAL;

	/* copy_from_user should be SMP safe. */
	if (copy_from_user(a, args, len))
		return -EFAULT;

	audit_socketcall(nargs[call] / sizeof(unsigned long), a);

	a0 = a[0];
	a1 = a[1];

	switch (call) {
	case SYS_SOCKET:
		err = sys_socket(a0, a1, a[2]);
		break;
	case SYS_BIND:
		err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_CONNECT:
		err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
		break;
	case SYS_LISTEN:
		err = sys_listen(a0, a1);
		break;
	case SYS_ACCEPT:
		/* accept() is accept4() with no flags */
		err = sys_accept4(a0, (struct sockaddr __user *)a1,
				  (int __user *)a[2], 0);
		break;
	case SYS_GETSOCKNAME:
		err =
		    sys_getsockname(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_GETPEERNAME:
		err =
		    sys_getpeername(a0, (struct sockaddr __user *)a1,
				    (int __user *)a[2]);
		break;
	case SYS_SOCKETPAIR:
		err = sys_socketpair(a0, a1, a[2], (int __user *)a[3]);
		break;
	case SYS_SEND:
		err = sys_send(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_SENDTO:
		err = sys_sendto(a0, (void __user *)a1, a[2], a[3],
				 (struct sockaddr __user *)a[4], a[5]);
		break;
	case SYS_RECV:
		err = sys_recv(a0, (void __user *)a1, a[2], a[3]);
		break;
	case SYS_RECVFROM:
		err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3],
				   (struct sockaddr __user *)a[4],
				   (int __user *)a[5]);
		break;
	case SYS_SHUTDOWN:
		err = sys_shutdown(a0, a1);
		break;
	case SYS_SETSOCKOPT:
		err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]);
		break;
	case SYS_GETSOCKOPT:
		err =
		    sys_getsockopt(a0, a1, a[2], (char __user *)a[3],
				   (int __user *)a[4]);
		break;
	case SYS_SENDMSG:
		err = sys_sendmsg(a0, (struct msghdr __user *)a1, a[2]);
		break;
	case SYS_RECVMSG:
		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
		break;
	case SYS_RECVMMSG:
		err = sys_recvmmsg(a0, (struct mmsghdr __user *)a1, a[2], a[3],
				   (struct timespec __user *)a[4]);
		break;
	case SYS_ACCEPT4:
		err = sys_accept4(a0, (struct sockaddr __user *)a1,
				  (int __user *)a[2], a[3]);
		break;
	default:
		err = -EINVAL;
		break;
	}
	return err;
}

首先调用copy_from_user 将用户态参数拷贝至数组a 。但是问题在于,每个被调用的 API 的参数不尽相同,那么每次拷贝的字节大小如何断定?
来看其第三个参数 len,它取自 nargs[call],其中 call 是操作码,后面有个大大的 switch...case就是判断它。对应的操作码定义在include/linux/net.h :

/*
 * Operation codes passed as the "call" argument of sys_socketcall().
 * Each value selects one socket API; the number in parentheses is the
 * manual section of the corresponding call.
 */
#define SYS_SOCKET    1        /* sys_socket(2)        */
#define SYS_BIND 2 /* sys_bind(2) */
#define SYS_CONNECT 3 /* sys_connect(2) */
#define SYS_LISTEN 4 /* sys_listen(2) */
#define SYS_ACCEPT 5 /* sys_accept(2) */
#define SYS_GETSOCKNAME 6 /* sys_getsockname(2) */
#define SYS_GETPEERNAME 7 /* sys_getpeername(2) */
#define SYS_SOCKETPAIR 8 /* sys_socketpair(2) */
#define SYS_SEND 9 /* sys_send(2) */
#define SYS_RECV 10 /* sys_recv(2) */
#define SYS_SENDTO 11 /* sys_sendto(2) */
#define SYS_RECVFROM 12 /* sys_recvfrom(2) */
#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */
#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */
#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */
#define SYS_SENDMSG 16 /* sys_sendmsg(2) */
#define SYS_RECVMSG 17 /* sys_recvmsg(2) */
#define SYS_ACCEPT4 18 /* sys_accept4(2) */
#define SYS_RECVMMSG 19 /* sys_recvmmsg(2) */

而数组nargs则根据操作码的不同,计算对应的参数的空间大小:

/*
 * Argument list sizes for sys_socketcall.
 *
 * nargs[call] is the number of bytes of user arguments to copy for the
 * given operation code: AL(n) is n unsigned longs.  Index 0 is unused
 * (valid opcodes start at 1); the counts match the number of arguments
 * each sys_*() handler receives.
 */
#define AL(x) ((x) * sizeof(unsigned long))
static const unsigned char nargs[20] = {
	AL(0), AL(3), AL(3), AL(3), AL(2), AL(3),
	AL(3), AL(3), AL(4), AL(4), AL(4), AL(6),
	AL(6), AL(2), AL(5), AL(5), AL(3), AL(3),
	AL(4), AL(5)
};

#undef AL

当拷贝完成参数后,就进入一个switch...case... 判断操作码,跳转至对应的系统接口。

二、 sys_socket 函数

当用户空间要创建一个socket 接口时,会调用 API 函数:

int socket(int domain, int type, int protocol);

函数,其三个参数分别表示协议族、协议类型(面向连接或无连接)以及协议
协议族:

/*
 * Address family (AF_*) numbers accepted as the first argument of
 * socket(), followed by the protocol family (PF_*) aliases, each of
 * which is defined as the identically named AF_* value.
 */
/* Supported address families. */
#define AF_UNSPEC 0
#define AF_UNIX 1 /* Unix domain sockets */
#define AF_LOCAL 1 /* POSIX name for AF_UNIX */
#define AF_INET 2 /* Internet IP Protocol */
#define AF_AX25 3 /* Amateur Radio AX.25 */
#define AF_IPX 4 /* Novell IPX */
#define AF_APPLETALK 5 /* AppleTalk DDP */
#define AF_NETROM 6 /* Amateur Radio NET/ROM */
#define AF_BRIDGE 7 /* Multiprotocol bridge */
#define AF_ATMPVC 8 /* ATM PVCs */
#define AF_X25 9 /* Reserved for X.25 project */
#define AF_INET6 10 /* IP version 6 */
#define AF_ROSE 11 /* Amateur Radio X.25 PLP */
#define AF_DECnet 12 /* Reserved for DECnet project */
#define AF_NETBEUI 13 /* Reserved for 802.2LLC project*/
#define AF_SECURITY 14 /* Security callback pseudo AF */
#define AF_KEY 15 /* PF_KEY key management API */
#define AF_NETLINK 16
#define AF_ROUTE AF_NETLINK /* Alias to emulate 4.4BSD */
#define AF_PACKET 17 /* Packet family */
#define AF_ASH 18 /* Ash */
#define AF_ECONET 19 /* Acorn Econet */
#define AF_ATMSVC 20 /* ATM SVCs */
#define AF_RDS 21 /* RDS sockets */
#define AF_SNA 22 /* Linux SNA Project (nutters!) */
#define AF_IRDA 23 /* IRDA sockets */
#define AF_PPPOX 24 /* PPPoX sockets */
#define AF_WANPIPE 25 /* Wanpipe API Sockets */
#define AF_LLC 26 /* Linux LLC */
#define AF_CAN 29 /* Controller Area Network */
#define AF_TIPC 30 /* TIPC sockets */
#define AF_BLUETOOTH 31 /* Bluetooth sockets */
#define AF_IUCV 32 /* IUCV sockets */
#define AF_RXRPC 33 /* RxRPC sockets */
#define AF_ISDN 34 /* mISDN sockets */
#define AF_PHONET 35 /* Phonet sockets */
#define AF_IEEE802154 36 /* IEEE802154 sockets */
#define AF_CAIF 37 /* CAIF sockets */
#define AF_MAX 38 /* For now.. */
/* Protocol families, same as address families. */
#define PF_UNSPEC AF_UNSPEC
#define PF_UNIX AF_UNIX
#define PF_LOCAL AF_LOCAL
#define PF_INET AF_INET
#define PF_AX25 AF_AX25
#define PF_IPX AF_IPX
#define PF_APPLETALK AF_APPLETALK
#define PF_NETROM AF_NETROM
#define PF_BRIDGE AF_BRIDGE
#define PF_ATMPVC AF_ATMPVC
#define PF_X25 AF_X25
#define PF_INET6 AF_INET6
#define PF_ROSE AF_ROSE
#define PF_DECnet AF_DECnet
#define PF_NETBEUI AF_NETBEUI
#define PF_SECURITY AF_SECURITY
#define PF_KEY AF_KEY
#define PF_NETLINK AF_NETLINK
#define PF_ROUTE AF_ROUTE
#define PF_PACKET AF_PACKET
#define PF_ASH AF_ASH
#define PF_ECONET AF_ECONET
#define PF_ATMSVC AF_ATMSVC
#define PF_RDS AF_RDS
#define PF_SNA AF_SNA
#define PF_IRDA AF_IRDA
#define PF_PPPOX AF_PPPOX
#define PF_WANPIPE AF_WANPIPE
#define PF_LLC AF_LLC
#define PF_CAN AF_CAN
#define PF_TIPC AF_TIPC
#define PF_BLUETOOTH AF_BLUETOOTH
#define PF_IUCV AF_IUCV
#define PF_RXRPC AF_RXRPC
#define PF_ISDN AF_ISDN
#define PF_PHONET AF_PHONET
#define PF_IEEE802154 AF_IEEE802154
#define PF_CAIF AF_CAIF
#define PF_MAX AF_MAX

协议类型:

/*
 * Socket types, passed as the "type" argument of socket().
 *
 * SOCK_STREAM    - stream (connection) socket
 * SOCK_DGRAM     - datagram (connectionless) socket
 * SOCK_RAW       - raw socket
 * SOCK_RDM       - reliably-delivered message
 * SOCK_SEQPACKET - sequential packet socket
 * SOCK_DCCP      - Datagram Congestion Control Protocol socket
 * SOCK_PACKET    - obsolete way of getting packets at the device level;
 *                  __sock_create() remaps (PF_INET, SOCK_PACKET) to PF_PACKET
 */
enum sock_type {
	SOCK_STREAM	= 1,
	SOCK_DGRAM	= 2,
	SOCK_RAW	= 3,
	SOCK_RDM	= 4,
	SOCK_SEQPACKET	= 5,
	SOCK_DCCP	= 6,
	SOCK_PACKET	= 10,
};

socket创建通过操作码SYS_SOCKET是由sys_socket() 实现的:

/*
 * sys_socket - create a socket and return a file descriptor for it.
 *
 * @family:   protocol family (PF_*)
 * @type:     socket type, optionally OR-ed with SOCK_CLOEXEC/SOCK_NONBLOCK
 * @protocol: protocol number within the family
 *
 * Splits the flag bits out of @type, creates the socket with
 * sock_create() and binds it to a descriptor with sock_map_fd().
 * Returns the new descriptor, or a negative errno (-EINVAL for
 * unknown flag bits, otherwise whatever the callees return).
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	int retval;
	struct socket *sock;
	int flags;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside SOCK_TYPE_MASK is a flag, not a type. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK when its value differs from O_NONBLOCK. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		goto out;

	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		goto out_release;

out:
	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;

out_release:
	sock_release(sock);
	return retval;
}

这段代码做了两件事:

1>  分配 sock 与sk,协议簇的协议封装;

2>  sock 面向上层系统调用,主要是与文件系统交互。

  通过进程的current指针的files,结合创建socket时返回的文件描述符,可以找到内核中对应的struct file,再根据file的f_dentry可以找到对应的目录项,而目录项struct dentry中,有d_inode指针,指向与sock封装在一起的inode。

  sock又与sk指针互指,一一对应。

三、 协议簇的协议封装

int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf; /*
* Check protocol is in range
*/
if (family < || family >= NPROTO)
return -EAFNOSUPPORT;
if (type < || type >= SOCK_MAX)
return -EINVAL; /* Compatibility. This uglymoron is moved from INET layer to here to avoid
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
static int warned;
if (!warned) {
warned = ;
printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n",
current->comm);
}
family = PF_PACKET;
} err = security_socket_create(family, type, protocol, kern);
if (err)
return err; /*
* Allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select an appropriate
* default.
*/
sock = sock_alloc();
if (!sock) {
if (net_ratelimit())
printk(KERN_WARNING "socket: no more sockets\n");
return -ENFILE; /* Not exactly a match, but its the
closest posix thing */
} sock->type = type; #ifdef CONFIG_MODULES
/* Attempt to load a protocol module if the find failed.
*
* 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
* requested real, full-featured networking support upon configuration.
* Otherwise module support will break!
*/
if (net_families[family] == NULL)
request_module("net-pf-%d", family);
#endif rcu_read_lock();
pf = rcu_dereference(net_families[family]);
err = -EAFNOSUPPORT;
if (!pf)
goto out_release; /*
* We will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
if (!try_module_get(pf->owner))
goto out_release; /* Now protected by module ref count */
rcu_read_unlock(); err = pf->create(net, sock, protocol, kern);
if (err < )
goto out_module_put; /*
* Now to bump the refcnt of the [loadable] module that owns this
* socket at sock_release time we decrement its refcnt.
*/
if (!try_module_get(sock->ops->owner))
goto out_module_busy; /*
* Now that we're done with the ->create function, the [loadable]
* module can have its refcnt decremented
*/
module_put(pf->owner);
err = security_socket_post_create(sock, family, type, protocol, kern);
if (err)
goto out_sock_release;
*res = sock; return ; out_module_busy:
err = -EAFNOSUPPORT;
out_module_put:
sock->ops = NULL;
module_put(pf->owner);
out_sock_release:
sock_release(sock);
return err; out_release:
rcu_read_unlock();
goto out_sock_release;
}
EXPORT_SYMBOL(__sock_create);

上面这个函数主要做了三件事:

1> sock_alloc()

在分析这个函数前,首先要了解:为了对 socket 抽像出文件的概念,内核中为socket定义了一个专门的文件系统类型sockfs。

/* Mount of the sockfs filesystem; assigned from kern_mount() in sock_init(). */
static struct vfsmount *sock_mnt __read_mostly;

/*
 * Filesystem type descriptor for "sockfs", the filesystem that backs
 * socket inodes.  Registered by sock_init().
 */
static struct file_system_type sock_fs_type = {
.name = "sockfs",
.mount = sockfs_mount,
.kill_sb = kill_anon_super,
};

在模块初始化的时候,安装该文件系统:

/*
 * sock_init - early initialisation of the socket layer.
 *
 * Sets up the sock and skbuff caches, the socket inode cache, registers
 * and mounts sockfs (storing the mount in sock_mnt), and runs the optional
 * netfilter / PHY timestamping initialisers.  Always returns 0.
 */
static int __init sock_init(void)
{
	/*
	 *      Initialize sock SLAB cache.
	 */

	sk_init();

	/*
	 *      Initialize skbuff SLAB cache
	 */
	skb_init();

	/*
	 *      Initialize the protocols module.
	 */

	init_inodecache();
	register_filesystem(&sock_fs_type);
	sock_mnt = kern_mount(&sock_fs_type);

	/* The real protocol initialization is performed in later initcalls.
	 */

#ifdef CONFIG_NETFILTER
	netfilter_init();
#endif

#ifdef CONFIG_NETWORK_PHY_TIMESTAMPING
	skb_timestamping_init();
#endif

	return 0;
}

core_initcall(sock_init);	/* early initcall */

文件系统安装中的一个重要步骤kern_mount->kern_mount_data->vfs_kern_mount:

vfs_kern_mount函数中,先根据注册的文件系统类型,如果文件系统本身有mount成员函数则调用之,没有则调用它的get_sb成员函数指针,获取相应的超级块sb 。最后,设置文件系统的超级块成员指针,使之指向对应的值。

其中sockfs文件系统的mount函数调用mount_pseudo()实现超级块的初始化,以及根节点inode和根目录项dentry的创建,sockfs_ops也是在这里与文件系统关联上的。

那前面提到的new_inode()函数分配inode 时调用的: sock_mnt->mnt_sb->s_op->alloc_inode(sock_mnt->mnt_sb);

/*
 * Superblock operations for sockfs: inodes are allocated and destroyed
 * through the socket-specific helpers, statfs through simple_statfs.
 */
static const struct super_operations sockfs_ops = {
.alloc_inode = sock_alloc_inode,
.destroy_inode = sock_destroy_inode,
.statfs = simple_statfs,
};

这个alloc_inode函数指针也就是sockfs_ops中的sock_alloc_inode()函数。

/*
 * sock_alloc_inode - allocate a combined socket + inode object.
 *
 * @sb: superblock the inode is allocated for (not used in this body)
 *
 * Allocates a struct socket_alloc from sock_inode_cachep plus a separate
 * struct socket_wq, initialises the embedded socket to an unconnected,
 * unbound state and returns the embedded VFS inode.  Returns NULL if
 * either allocation fails.
 */
static struct inode *sock_alloc_inode(struct super_block *sb)
{
	struct socket_alloc *ei;

	ei = kmem_cache_alloc(sock_inode_cachep, GFP_KERNEL);
	if (!ei)
		return NULL;
	ei->socket.wq = kmalloc(sizeof(struct socket_wq), GFP_KERNEL);
	if (!ei->socket.wq) {
		/* undo the first allocation so nothing leaks */
		kmem_cache_free(sock_inode_cachep, ei);
		return NULL;
	}
	init_waitqueue_head(&ei->socket.wq->wait);
	ei->socket.wq->fasync_list = NULL;

	ei->socket.state = SS_UNCONNECTED;
	ei->socket.flags = 0;
	ei->socket.ops = NULL;
	ei->socket.sk = NULL;
	ei->socket.file = NULL;

	return &ei->vfs_inode;
}

函数先分配了一个用于封装socket和inode的ei ,然后在高速缓存中为之申请了一块空间。这样,inode和socket就同时都被分配了。接下来初始化socket的各个成员。

/*
 * Container that holds a struct socket and its VFS inode in a single
 * allocation; sock_alloc_inode() allocates one of these and hands out
 * the embedded vfs_inode.
 */
struct socket_alloc {
struct socket socket;
struct inode vfs_inode;
};

显而易见,该结构实现了inode和socket的封装。已经通过new_inode从sockfs文件系统分配一个inode,可以通过宏SOCKET_I来获取与之对应的socket:

sock = SOCKET_I(inode);

分配inode、socket 以及两者如何关联,都已一一分析了。

2> pf = rcu_dereference(net_families[family]);

net_families[family]的定义:

/* Registered protocol families, indexed by family number; filled by sock_register(). */
static const struct net_proto_family *net_families[NPROTO] __read_mostly;

net_proto_family的定义:

/*
 * Per-family registration record stored in net_families[].
 * __sock_create() pins @owner with try_module_get() and then calls
 * @create to let the family set up a newly allocated socket.
 */
struct net_proto_family {
int family; /* family number; used as the net_families[] index */
int (*create)(struct net *net, struct socket *sock,
int protocol, int kern); /* family-specific socket setup */
struct module *owner; /* module providing ->create */
};

net_families数组填充函数sock_register():

/**
 *	sock_register - add a socket protocol handler
 *	@ops: description of protocol
 *
 *	This function is called by a protocol handler that wants to
 *	advertise its address family, and have it linked into the
 *	socket interface. The value ops->family corresponds to the
 *	socket system call protocol family.
 *
 *	Returns 0 on success, -ENOBUFS if ops->family is not below NPROTO,
 *	or -EEXIST if the slot in net_families[] is already taken.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	if (ops->family >= NPROTO) {
		printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family,
		       NPROTO);
		return -ENOBUFS;
	}

	/* net_family_lock serialises updates of the table */
	spin_lock(&net_family_lock);
	if (net_families[ops->family])
		err = -EEXIST;
	else {
		net_families[ops->family] = ops;
		err = 0;
	}
	spin_unlock(&net_family_lock);

	printk(KERN_INFO "NET: Registered protocol family %d\n", ops->family);
	return err;
}
EXPORT_SYMBOL(sock_register);

从这里我们看出每个协议族都是通过sock_register函数注册到net_families数组中,通过代码搜索发现每个协议族都会调用这个函数去注册。

Af_ax25.c (net\ax25):    sock_register(&ax25_family_ops);
Af_bluetooth.c (net\bluetooth): err = sock_register(&bt_sock_family_ops);
Af_can.c (net\can): sock_register(&can_family_ops);
Af_decnet.c (net\decnet): sock_register(&dn_family_ops);
Af_econet.c (net\econet): sock_register(&econet_family_ops);
Af_ieee802154.c (net\ieee802154): rc = sock_register(&ieee802154_family_ops);
Af_inet.c (net\ipv4): (void)sock_register(&inet_family_ops);
Af_inet6.c (net\ipv6): err = sock_register(&inet6_family_ops);
Af_ipx.c (net\ipx): sock_register(&ipx_family_ops);
Af_irda.c (net\irda): rc = sock_register(&irda_family_ops);
Af_iucv.c (net\iucv): err = sock_register(&iucv_sock_family_ops);
Af_key.c (net\key): err = sock_register(&pfkey_family_ops);
Af_llc.c (net\llc): rc = sock_register(&llc_ui_family_ops);
Af_netlink.c (net\netlink): sock_register(&netlink_family_ops);
Af_netrom.c (net\netrom): if (sock_register(&nr_family_ops)) {
Af_packet.c (net\packet): sock_register(&packet_family_ops);
Af_phonet.c (net\phonet): err = sock_register(&phonet_proto_family);
Af_rds.c (net\rds): ret = sock_register(&rds_family_ops);
Af_rose.c (net\rose): sock_register(&rose_family_ops);
Af_rxrpc.c (net\rxrpc): ret = sock_register(&rxrpc_family_ops);
Af_unix.c (net\unix): sock_register(&unix_family_ops);
Af_x25.c (net\x25): rc = sock_register(&x25_family_ops);
Caif_socket.c (net\caif): int err = sock_register(&caif_family_ops);
Ddp.c (net\appletalk): (void)sock_register(&atalk_family_ops);
Net.h (include\linux):extern int sock_register(const struct net_proto_family *fam);
Pppox.c (drivers\net): return sock_register(&pppox_proto_family);
Pvc.c (net\atm): return sock_register(&pvc_family_ops);
Socket.c (drivers\isdn\misdn): err = sock_register(&mISDN_sock_family_ops);
Socket.c (net): * sock_register - add a socket protocol handler
Socket.c (net):int sock_register(const struct net_proto_family *ops)
Socket.c (net):EXPORT_SYMBOL(sock_register);
Socket.c (net\tipc): res = sock_register(&tipc_family_ops);
Svc.c (net\atm): return sock_register(&svc_family_ops);

本文主要分析的ipv4协议族,所以我们参考的文件af_inet.c(net/ipv4)。

3> err = pf->create(net, sock, protocol, kern);

在af_inet.c里面inet_init函数里面调用sock_register注册到协议族数组net_families里:

(void)sock_register(&inet_family_ops);

接着看inet_family_ops定义:

/*
 * Registration record for PF_INET, passed to sock_register(); its
 * ->create hook is inet_create().
 */
static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
};

这里的inet_create就是程序调用的函数:

/*
 *	Create an inet socket.
 *
 *	@net:      network namespace, checked with inet_netns_ok() and passed
 *	           to sk_alloc()
 *	@sock:     socket being set up; sock->type selects the inetsw[] list
 *	@protocol: requested protocol; IPPROTO_IP acts as a wildcard
 *	@kern:     non-zero skips the CAP_NET_RAW check for SOCK_RAW
 *
 *	Finds the inet_protosw entry matching (sock->type, protocol), trying
 *	up to two module loads if none is registered, then allocates the
 *	struct sock, fills in the inet defaults and runs the protocol's
 *	->init hook.  Returns 0 on success or a negative errno.
 */

static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	char answer_no_check;
	int try_loading_module = 0;
	int err;

	if (unlikely(!inet_ehash_secret))
		if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
			build_ehash_secret();

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
		goto out_rcu_unlock;

	err = -EAFNOSUPPORT;
	if (!inet_netns_ok(net, protocol))
		goto out_rcu_unlock;

	/* Copy what is needed from the entry before leaving the RCU section. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_no_check = answer->no_check;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(answer_prot->slab == NULL);

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
	if (sk == NULL)
		goto out;

	err = 0;
	sk->sk_no_check = answer_no_check;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = 1;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (ipv4_config.no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;

	sk_refcnt_debug_inc(sk);

	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		sk->sk_prot->hash(sk);
	}

	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

在分析inet_create()函数前,就要分析inetsw[SOCK_MAX]这个数组。

static struct list_head inetsw[SOCK_MAX];

这个数组是在inet_init()->inet_register_protosw()里面填充的。

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
inet_register_protosw(q);

inetsw_array定义:

/* Upon startup we insert all the elements in inetsw_array[] into
* the linked list inetsw.
*/
static struct inet_protosw inetsw_array[] =
{
{
.type = SOCK_STREAM,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
.no_check = ,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
}, {
.type = SOCK_DGRAM,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
}, {
.type = SOCK_RAW,
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
}; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array)

inet_register_protosw函数分析:

/*
 * inet_register_protosw - add a protocol switch entry to inetsw[].
 *
 * @p: entry to insert; p->type selects the list, p->protocol is checked
 *     against existing permanent entries
 *
 * Under inetsw_lock, refuses an out-of-range type and refuses to override
 * a permanent entry with the same protocol (both cases only log an error).
 * Otherwise links @p in right after the last permanent entry of its list,
 * or at the list head when there is none.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	spin_lock_bh(&inetsw_lock);

	if (p->type >= SOCK_MAX)
		goto out_illegal;

	/* If we are trying to override a permanent protocol, bail. */
	answer = NULL;
	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);

		/* Check only the non-wild match. */
		if (INET_PROTOSW_PERMANENT & answer->flags) {
			if (protocol == answer->protocol)
				break;
			last_perm = lh;
		}

		answer = NULL;
	}
	if (answer)
		goto out_permanent;

	/* Add the new entry after the last permanent entry if any, so that
	 * the new entry does not override a permanent entry when matched with
	 * a wild-card protocol. But it is allowed to override any existing
	 * non-permanent entry.  This means that when we remove this entry, the
	 * system automatically returns to the old behavior.
	 */
	list_add_rcu(&p->list, last_perm);
out:
	spin_unlock_bh(&inetsw_lock);

	return;

out_permanent:
	printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
	       protocol);
	goto out;

out_illegal:
	printk(KERN_ERR
	       "Ignoring attempt to register invalid socket type %d.\n",
	       p->type);
	goto out;
}
EXPORT_SYMBOL(inet_register_protosw);

这个函数完成的工作,就是把inetsw_array 数组中,相同的协议类型(type成员)下边的协议,加入到inetsw 对应的协议类型的链表中去。
因为事实上一对一的关系,所以这个函数要简单得多:
  因为不存在其它成员,所以每一次 list_entry 都为空值,所以不存在覆盖和追加的情况,直接调用list_add_rcu(&p->list, last_perm);
  把协议类型节点(struct inet_protosw 类型的数组的某个元素)添加到链表中(链表首部本身是一个数组 inetsw[],数组索引就是该协议对应的协议类型的值,即 p->type)。

继续分析inet_create()函数:

  首先,根据sock的成员protocol,把之前在链表中注册的协议节点找出。

  然后,将创建的socket 的ops 函数指针集,指向协议类型对应的操作集,例如创建的是SOCK_STREAM,那么就指向了inet_stream_ops; answer_prot 指针指向了当前要创建的socket 的协议类型下边的协议,如上例,它就是IPPROTO_TCP 的tcp_prot结构。

  接着,一个重要的工作,就是为socket分配一个sock,并初始化它。

  最后,初始化一个 inet 。

虽然create 的代码就到这儿了,不过要说清楚sk(struct sock)的分配,还得费上大力气。
每一个Socket 套接字,都有一个对应的 struct socket 结构来描述(内核中一般使用名称为sock),但是同时又有一个struct sock 结构(内核中一般使用名称为sk),两者之间是一一对应的关系。

在后面的sock_init_data 函数中,可以看到:

sk->sk_socket = sock;
sock->sk = sk;

socket 结构和 sock 结构实际上是同一个事物的两个方面。不妨说,socket 结构是面向进程和系统调用界面的侧面,而 sock 结构则是面向底层驱动程序的侧面。

设计者把socket套接字中,与文件系统关系比较密切的那一部份放在socket结构中,而把与通信关系比较密切的那一部份,则单独成为一个数据结构,那就是sock 结构。

由于这两部份逻辑上本来就是一体的,所以要通过指针互相指向对方,形成一对一的关系。

调用sk_alloc()分配一个sk:

  在之前proto_register()函数创建的高速缓存中申请分配一个slab缓存项,并清零。然后设置协议族、并把sk中的sk_prot与对应的协议关联起来。

分配完成sk后,另一个重要的功能就是初始化它

  sk的成员相当复杂,其主要的初始化工作是在函数sock_init_data()中完成的:
  sock 结构中,有三个重要的双向队列,分别是 sk_receive_queue、sk_write_queue、sk_error_queue。从它们的名字就可以看出来其作用了。
队列并非采用通用的list_head来维护,而是使用sk_buff_head队列:

/* Head of a doubly linked sk_buff queue, with its own length and lock. */
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen; /* NOTE(review): presumably the number of buffers queued - confirm */
spinlock_t lock;
};

这样,队列中指向的每一个skb_buffer,就是一个数据包,分别是接收、发送和投递错误。
inet 初始化:
inet 是一个struct inet_sock 结构类型,来看它的定义:

struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
……
}

只留意它的第一个成员就足够了。
我们说sock 是面向用户态调用,而sk是面向内核驱动调用的,那sk是如何与协议栈交互的呢?
对于每一个类型的协议,为了与sk联系起来,都定义了一个struct XXX_sock 结构,XXX是协议名,例如:

struct tcp_sock {
/* inet_sock has to be the first member of tcp_sock */
struct inet_sock inet;
int tcp_header_len; /* Bytes of tcp header to send */
……
}

很明显,它们的结构定义是“af_inet 一般属性+ 自己的私有属性” ,因为它们的第一个成员总是inet 。

现在回头来看一下起初在af_inet.c中,封装协议注册proto_register()的时候,size成员,对于tcp而言:

struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
...
.obj_size = sizeof(struct tcp_sock),
...
};

其它协议类似。

以obj_size 来确定每个 slab 缓存项分配的大小,所以,我们就可说,每次申请分配的,实际上是一个struct XXX_sock 结构大小的结构。因为都是定义于上层结构的第一个成员,可以使用强制类型转换来使用这块分配的内存空间。例如:

struct inet_sock {
/* sk and pinet6 has to be the first two members of inet_sock */
struct sock sk;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct ipv6_pinfo *pinet6;
#endif
/* Socket demultiplex comparisons on incoming packets. */
__be32 inet_daddr;
__be32 inet_rcv_saddr;
__be16 inet_dport;
__u16 inet_num;
__be32 inet_saddr;
__s16 uc_ttl;
__u16 cmsg_flags;
__be16 inet_sport;
__u16 inet_id;
...
}; inet = inet_sk(sk);
/*
 * Convert a struct sock pointer to the struct inet_sock that contains it.
 * The plain cast depends on sk being the first member of struct inet_sock.
 */
static inline struct inet_sock *inet_sk(const struct sock *sk)
{
  return (struct inet_sock *)sk; //inet_sock->sk
}
struct tcp_sock {
/* inet_connection_sock has to be the first member of tcp_sock */
struct inet_connection_sock inet_conn;
u16 tcp_header_len; /* Bytes of tcp header to send */
...
}; struct tcp_sock *tp = tcp_sk(sk);
/*
 * Convert a struct sock pointer to the struct tcp_sock that contains it.
 * The plain cast depends on the first-member nesting noted in the trailing
 * comment below (tcp_sock -> inet_conn -> icsk_inet -> sk).
 */
static inline struct tcp_sock *tcp_sk(const struct sock *sk)
{
  return (struct tcp_sock *)sk; //tcp_sock->inet_conn->icsk_inet->sk
}

inet_create()运行完,一个 socket 套接字基本上就创建完毕了,剩下的就是与文件系统挂钩。

四、与文件系统交互

回到sys_socket()函数中来,它在调用完sock_create()后,紧接着调用sock_map_fd()函数:

/*
 * sock_map_fd - attach a socket to a new file descriptor.
 *
 * @sock:  socket to expose as a file
 * @flags: file flags handed to sock_alloc_file()
 *
 * Obtains a descriptor and struct file from sock_alloc_file() and, if that
 * succeeded, installs the file in the descriptor table.  Returns the
 * descriptor, or the negative value returned by sock_alloc_file().
 */
int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd = sock_alloc_file(sock, &newfile, flags);

	if (likely(fd >= 0))
		fd_install(fd, newfile);

	return fd;
}
EXPORT_SYMBOL(sock_map_fd);

这个函数的核心思想,在一开始,就已经分析过了。
从进程的角度来讲,一个 socket 套接字就是一个特殊的,已打开的文件。
前面分配好一个socket后,这里要做的就是将它与文件系统拉上亲戚关系。
首先获取一个空闲的文件描述符号和file结构。然后在文件系统中分配一个目录项(d_alloc),使其指向已经分配的inode节点(d_add),然后把其目录项挂在sockfs文件系统的根目录之下。
并且把目录项的指针d_op设置成指向 sockfs_dentry_operations,这个数据结构通过函数指针提供与文件路径有关的操作:

/* Dentry operations for sockfs; only the d_dname hook is set, to sockfs_dname. */
static const struct dentry_operations sockfs_dentry_operations = {
.d_dname = sockfs_dname,
};

最后一步,就是将file结构中的f_op和socket对应的inode结构中的i_fop都指向socket_file_ops,它是一个函数指针集,指向了socket面向文件系统的用户态调用的一些接口函数:

/*
* Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
* in the operation structures but are done directly via the socketcall() multiplexor.
*
* This table holds the generic file operations of a socket file: each
* hook points at the socket-specific implementation named on its line.
*/
static const struct file_operations socket_file_ops = {
.owner = THIS_MODULE,
.llseek = no_llseek,
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.unlocked_ioctl = sock_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
.fasync = sock_fasync,
.sendpage = sock_sendpage,
.splice_write = generic_splice_sendpage,
.splice_read = sock_splice_read,
};

到这里,整个socket 套接字的创建工作,就宣告完成了。

上一篇:HttpClient4登陆有验证码的网站


下一篇:VMware ESXI5.5 Memories limits resolved soluation.