进程间通信的内核实现

  Linux 内核提供了多种通用的进程间通信机制以适应不同的应用场景,比如 FIFO、pipe、signal、socket、shared memory(共享内存)。为了实现进程间通信,应用程序通常需要通过系统调用陷入内核态,由内核完成数据交换。

Linux提供的进程间通信接口可以满足基本的编程需求,但是为了克服某些缺点,达到应用程序或者OS的特有的需求,可以在内核态实现自定义的进程间通信方式,比如kdbus/binder等。

  以socket和binder为例,看看通用的进程间通信的实现机制以及如何自己实现一个进程间通信的接口。

典型的socket通信图:

进程间通信的流程和网络通信基本一致。下面是一段比较常见的进程间通信代码:

Server:

 /*
  * AF_UNIX stream server example: binds MY_SOCK_ADDR, accepts a single
  * client, prints the first message it sends and then exits.
  */
#include <stdio.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

#include <errno.h>
#include <string.h>

#include <unistd.h>

#define MY_SOCK_ADDR "/tmp/my_socket"

/* Number of not-yet-accepted connections the kernel may queue. */
#define MY_BACKLOG 5

/*
 * Returns 0 when one message was received and printed, -1 on any failure.
 * The socket file is removed again before returning from the cleanup path.
 */
int main(void)
{
    int fd;
    int ret = 0;
    int new_fd;

    struct sockaddr_un my_addr, client_addr;
    socklen_t len;

    char buffer[1024];
    ssize_t size;

    /* create a socket for inter-process communication */
    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0) {
        fprintf(stderr, "create socket error(%s)\n", strerror(errno));
        return -1;
    }

    /* initialize socket address; snprintf bounds the copy to sun_path */
    memset(&my_addr, 0, sizeof(my_addr));
    my_addr.sun_family = AF_UNIX;
    snprintf(my_addr.sun_path, sizeof(my_addr.sun_path), "%s", MY_SOCK_ADDR);

    /* remove a stale socket file first, otherwise bind() fails with EADDRINUSE */
    unlink(MY_SOCK_ADDR);
    ret = bind(fd, (const struct sockaddr *)&my_addr, sizeof(my_addr));
    if (ret < 0) {
        fprintf(stderr, "bind address failed(%s)\n", strerror(errno));
        goto out;
    }

    /* listen with an explicit, non-negative backlog */
    ret = listen(fd, MY_BACKLOG);
    if (ret < 0) {
        fprintf(stderr, "listen failed(%s)\n", strerror(errno));
        goto out;
    }

    /*
     * accept() treats the length as a value-result argument: it must hold
     * the size of the address buffer on entry.
     */
    len = sizeof(client_addr);
    new_fd = accept(fd, (struct sockaddr *)&client_addr, &len);
    if (new_fd < 0) {
        fprintf(stderr, "accept error.(%s)\n", strerror(errno));
        ret = -1;
        goto out;
    }

    /* keep one byte free so the data can always be NUL-terminated */
    size = read(new_fd, buffer, sizeof(buffer) - 1);
    if (size < 0) {
        fprintf(stderr, "read error(%s)\n", strerror(errno));
        ret = -1;
        close(new_fd);
        goto out;
    }
    buffer[size] = '\0';

    printf("received \"%s\" from client\n", buffer);
    close(new_fd);

out:
    close(fd);
    unlink(MY_SOCK_ADDR);

    return ret;
}

 Client:

 /*
  * AF_UNIX stream client example: connects to SERVER_SOCK_ADDR and sends
  * one NUL-terminated greeting.
  */
#include <stdio.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

#include <errno.h>
#include <string.h>

#include <unistd.h>

#define SERVER_SOCK_ADDR "/tmp/my_socket"

/*
 * Returns 0 once the whole message has been written, -1 on any failure.
 */
int main(void)
{
    static const char message[] = "Hello server";
    const char *cursor = message;
    /* sizeof includes the trailing NUL, i.e. strlen(message) + 1 bytes */
    size_t remaining = sizeof(message);

    struct sockaddr_un server_addr;
    int fd;
    int ret;

    /* create a socket for inter-process communication */
    fd = socket(AF_UNIX, SOCK_STREAM, 0);
    if (fd < 0) {
        fprintf(stderr, "create socket error(%s)\n", strerror(errno));
        return -1;
    }

    /* initialize socket address; snprintf bounds the copy to sun_path */
    memset(&server_addr, 0, sizeof(server_addr));
    server_addr.sun_family = AF_UNIX;
    snprintf(server_addr.sun_path, sizeof(server_addr.sun_path), "%s",
             SERVER_SOCK_ADDR);

    /* connect to server */
    ret = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr));
    if (ret < 0) {
        fprintf(stderr, "connect to server failed(%s)\n", strerror(errno));
        close(fd);
        return -1;
    }

    /* write() may accept fewer bytes than requested, so loop until done */
    while (remaining > 0) {
        ssize_t written = write(fd, cursor, remaining);

        if (written < 0) {
            if (errno == EINTR)
                continue;   /* interrupted before any data was sent: retry */
            fprintf(stderr, "write to server failed(%s)\n", strerror(errno));
            close(fd);
            return -1;
        }
        cursor += written;
        remaining -= (size_t)written;
    }

    close(fd);

    return 0;
}

 

socket 的用户态接口位于 libc.so 中,基本上只是对系统调用的一层封装,调用后即陷入内核态。下面来看 socket 在内核态的实现。

socket和文件系统之间的关系可以参考下面的链接。

http://linuxeco.com/?p=1

 

 1 /**
 2  *  struct socket - general BSD socket
 3  *  @state: socket state (%SS_CONNECTED, etc)
 4  *  @type: socket type (%SOCK_STREAM, etc)
 5  *  @flags: socket flags (%SOCK_ASYNC_NOSPACE, etc)
 6  *  @ops: protocol specific socket operations
 7  *  @file: File back pointer for gc
 8  *  @sk: internal networking protocol agnostic socket representation
 9  *  @wq: wait queue for several uses
10  */
11 struct socket {
12     socket_state        state;
13 
14     kmemcheck_bitfield_begin(type);    /* paired with kmemcheck_bitfield_end(type); together they bracket 'type' */
15     short            type;
16     kmemcheck_bitfield_end(type);
17 
18     unsigned long        flags;
19 
20     struct socket_wq __rcu    *wq;    /* NOTE(review): __rcu annotation - presumably accessed under RCU rules; confirm */
21 
22     struct file        *file;
23     struct sock        *sk;    /* the protocol-agnostic part described by @sk above */
24     const struct proto_ops    *ops;    /* const: the ops table is not written through this pointer */
25 };
  1 /**
  2   *    struct sock - network layer representation of sockets
  3   *    @__sk_common: shared layout with inet_timewait_sock
  4   *    @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
  5   *    @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
  6   *    @sk_lock:    synchronizer
  7   *    @sk_rcvbuf: size of receive buffer in bytes
  8   *    @sk_wq: sock wait queue and async head
  9   *    @sk_rx_dst: receive input route used by early demux
 10   *    @sk_dst_cache: destination cache
 11   *    @sk_dst_lock: destination cache lock
 12   *    @sk_policy: flow policy
 13   *    @sk_receive_queue: incoming packets
 14   *    @sk_wmem_alloc: transmit queue bytes committed
 15   *    @sk_write_queue: Packet sending queue
 16   *    @sk_async_wait_queue: DMA copied packets
 17   *    @sk_omem_alloc: "o" is "option" or "other"
 18   *    @sk_wmem_queued: persistent queue size
 19   *    @sk_forward_alloc: space allocated forward
 20   *    @sk_napi_id: id of the last napi context to receive data for sk
 21   *    @sk_ll_usec: usecs to busypoll when there is no data
 22   *    @sk_allocation: allocation mode
 23   *    @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
 24   *    @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
 25   *    @sk_sndbuf: size of send buffer in bytes
 26   *    @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 27   *           %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 28   *    @sk_no_check: %SO_NO_CHECK setting, whether or not checkup packets
 29   *    @sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
 30   *    @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK)
 31   *    @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
 32   *    @sk_gso_max_size: Maximum GSO segment size to build
 33   *    @sk_gso_max_segs: Maximum number of GSO segments
 34   *    @sk_lingertime: %SO_LINGER l_linger setting
 35   *    @sk_backlog: always used with the per-socket spinlock held
 36   *    @sk_callback_lock: used with the callbacks in the end of this struct
 37   *    @sk_error_queue: rarely used
 38   *    @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
 39   *              IPV6_ADDRFORM for instance)
 40   *    @sk_err: last error
 41   *    @sk_err_soft: errors that don't cause failure but are the cause of a
 42   *              persistent failure not just 'timed out'
 43   *    @sk_drops: raw/udp drops counter
 44   *    @sk_ack_backlog: current listen backlog
 45   *    @sk_max_ack_backlog: listen backlog set in listen()
 46   *    @sk_priority: %SO_PRIORITY setting
 47   *    @sk_cgrp_prioidx: socket group's priority map index
 48   *    @sk_type: socket type (%SOCK_STREAM, etc)
 49   *    @sk_protocol: which protocol this socket belongs in this network family
 50   *    @sk_peer_pid: &struct pid for this socket's peer
 51   *    @sk_peer_cred: %SO_PEERCRED setting
 52   *    @sk_rcvlowat: %SO_RCVLOWAT setting
 53   *    @sk_rcvtimeo: %SO_RCVTIMEO setting
 54   *    @sk_sndtimeo: %SO_SNDTIMEO setting
 55   *    @sk_rxhash: flow hash received from netif layer
 56   *    @sk_filter: socket filtering instructions
 57   *    @sk_protinfo: private area, net family specific, when not using slab
 58   *    @sk_timer: sock cleanup timer
 59   *    @sk_stamp: time stamp of last packet received
 60   *    @sk_socket: Identd and reporting IO signals
 61   *    @sk_user_data: RPC layer private data
 62   *    @sk_frag: cached page frag
 63   *    @sk_peek_off: current peek_offset value
 64   *    @sk_send_head: front of stuff to transmit
 65   *    @sk_security: used by security modules
 66   *    @sk_mark: generic packet mark
 67   *    @sk_classid: this socket's cgroup classid
 68   *    @sk_cgrp: this socket's cgroup-specific proto data
 69   *    @sk_write_pending: a write to stream socket waits to start
 70   *    @sk_state_change: callback to indicate change in the state of the sock
 71   *    @sk_data_ready: callback to indicate there is data to be processed
 72   *    @sk_write_space: callback to indicate there is bf sending space available
 73   *    @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE)
 74   *    @sk_backlog_rcv: callback to process the backlog
 75   *    @sk_destruct: called at sock freeing time, i.e. when all refcnt == 0
 76  */
 77 struct sock {
 78     /*
 79      * Now struct inet_timewait_sock also uses sock_common, so please just
 80      * don't add nothing before this first member (__sk_common) --acme
 81      */
 82     struct sock_common    __sk_common;
 83 #define sk_node            __sk_common.skc_node    /* this and the following sk_* macros are shorthands for __sk_common.skc_* members */
 84 #define sk_nulls_node        __sk_common.skc_nulls_node
 85 #define sk_refcnt        __sk_common.skc_refcnt
 86 #define sk_tx_queue_mapping    __sk_common.skc_tx_queue_mapping
 87 
 88 #define sk_dontcopy_begin    __sk_common.skc_dontcopy_begin
 89 #define sk_dontcopy_end        __sk_common.skc_dontcopy_end
 90 #define sk_hash            __sk_common.skc_hash
 91 #define sk_portpair        __sk_common.skc_portpair
 92 #define sk_num            __sk_common.skc_num
 93 #define sk_dport        __sk_common.skc_dport
 94 #define sk_addrpair        __sk_common.skc_addrpair
 95 #define sk_daddr        __sk_common.skc_daddr
 96 #define sk_rcv_saddr        __sk_common.skc_rcv_saddr
 97 #define sk_family        __sk_common.skc_family
 98 #define sk_state        __sk_common.skc_state
 99 #define sk_reuse        __sk_common.skc_reuse
100 #define sk_reuseport        __sk_common.skc_reuseport
101 #define sk_bound_dev_if        __sk_common.skc_bound_dev_if
102 #define sk_bind_node        __sk_common.skc_bind_node
103 #define sk_prot            __sk_common.skc_prot
104 #define sk_net            __sk_common.skc_net
105 #define sk_v6_daddr        __sk_common.skc_v6_daddr
106 #define sk_v6_rcv_saddr    __sk_common.skc_v6_rcv_saddr
107 
108     socket_lock_t        sk_lock;
109     struct sk_buff_head    sk_receive_queue;
110     /*
111      * The backlog queue is special, it is always used with
112      * the per-socket spinlock held and requires low latency
113      * access. Therefore we special case it's implementation.
114      * Note : rmem_alloc is in this structure to fill a hole
115      * on 64bit arches, not because its logically part of
116      * backlog.
117      */
118     struct {
119         atomic_t    rmem_alloc;
120         int        len;
121         struct sk_buff    *head;
122         struct sk_buff    *tail;
123     } sk_backlog;
124 #define sk_rmem_alloc sk_backlog.rmem_alloc    /* shorthand for the counter that is stored inside sk_backlog */
125     int            sk_forward_alloc;
126 #ifdef CONFIG_RPS
127     __u32            sk_rxhash;
128 #endif
129 #ifdef CONFIG_NET_RX_BUSY_POLL
130     unsigned int        sk_napi_id;
131     unsigned int        sk_ll_usec;
132 #endif
133     atomic_t        sk_drops;
134     int            sk_rcvbuf;
135 
136     struct sk_filter __rcu    *sk_filter;
137     struct socket_wq __rcu    *sk_wq;
138 
139 #ifdef CONFIG_NET_DMA
140     struct sk_buff_head    sk_async_wait_queue;
141 #endif
142 
143 #ifdef CONFIG_XFRM
144     struct xfrm_policy    *sk_policy[2];    /* NOTE(review): two slots - presumably one per direction; confirm */
145 #endif
146     unsigned long         sk_flags;
147     struct dst_entry    *sk_rx_dst;
148     struct dst_entry __rcu    *sk_dst_cache;
149     spinlock_t        sk_dst_lock;
150     atomic_t        sk_wmem_alloc;
151     atomic_t        sk_omem_alloc;
152     int            sk_sndbuf;
153     struct sk_buff_head    sk_write_queue;
154     kmemcheck_bitfield_begin(flags);
155     unsigned int        sk_shutdown  : 2,    /* widths in this declaration: 2 + 2 + 4 + 8 + 16 = 32 bits */
156                 sk_no_check  : 2,
157                 sk_userlocks : 4,
158                 sk_protocol  : 8,
159 #define SK_PROTOCOL_MAX U8_MAX    /* sk_protocol is declared 8 bits wide on the line above */
160                 sk_type      : 16;
161     kmemcheck_bitfield_end(flags);
162     int            sk_wmem_queued;
163     gfp_t            sk_allocation;
164     u32            sk_pacing_rate; /* bytes per second */
165     u32            sk_max_pacing_rate;
166     netdev_features_t    sk_route_caps;
167     netdev_features_t    sk_route_nocaps;
168     int            sk_gso_type;
169     unsigned int        sk_gso_max_size;
170     u16            sk_gso_max_segs;
171     int            sk_rcvlowat;
172     unsigned long            sk_lingertime;
173     struct sk_buff_head    sk_error_queue;
174     struct proto        *sk_prot_creator;
175     rwlock_t        sk_callback_lock;
176     int            sk_err,
177                 sk_err_soft;
178     unsigned short        sk_ack_backlog;
179     unsigned short        sk_max_ack_backlog;
180     __u32            sk_priority;
181 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
182     __u32            sk_cgrp_prioidx;
183 #endif
184     struct pid        *sk_peer_pid;
185     const struct cred    *sk_peer_cred;
186     long            sk_rcvtimeo;
187     long            sk_sndtimeo;
188     void            *sk_protinfo;
189     struct timer_list    sk_timer;
190     ktime_t            sk_stamp;
191     struct socket        *sk_socket;
192     void            *sk_user_data;
193     struct page_frag    sk_frag;
194     struct sk_buff        *sk_send_head;
195     __s32            sk_peek_off;
196     int            sk_write_pending;
197 #ifdef CONFIG_SECURITY
198     void            *sk_security;
199 #endif
200     __u32            sk_mark;
201     u32            sk_classid;
202     struct cg_proto        *sk_cgrp;
203     void            (*sk_state_change)(struct sock *sk);    /* callback pointers from here to the end of the struct */
204     void            (*sk_data_ready)(struct sock *sk, int bytes);
205     void            (*sk_write_space)(struct sock *sk);
206     void            (*sk_error_report)(struct sock *sk);
207     int            (*sk_backlog_rcv)(struct sock *sk,
208                           struct sk_buff *skb);
209     void                    (*sk_destruct)(struct sock *sk);
210 };
View Code

 /* The AF_UNIX socket */

 1 /* The AF_UNIX socket */
 2 struct unix_sock {
 3     /* WARNING: sk has to be the first member */
 4     struct sock        sk;    /* NOTE(review): presumably first so a struct sock * can be converted to struct unix_sock *; confirm */
 5     struct unix_address     *addr;
 6     struct path        path;
 7     struct mutex        readlock;
 8     struct sock        *peer;    /* NOTE(review): presumably the socket at the other end of the connection; confirm */
 9     struct list_head    link;
10     atomic_long_t        inflight;
11     spinlock_t        lock;
12     unsigned char        recursion_level;
13     unsigned long        gc_flags;
14 #define UNIX_GC_CANDIDATE    0    /* NOTE(review): this and the next constant are presumably bit numbers within gc_flags; confirm */
15 #define UNIX_GC_MAYBE_CYCLE    1
16     struct socket_wq    peer_wq;
17     wait_queue_t        peer_wake;
18 };
View Code

 

  1 /**
  2  *    struct sk_buff - socket buffer
  3  *    @next: Next buffer in list
  4  *    @prev: Previous buffer in list
  5  *    @tstamp: Time we arrived
  6  *    @sk: Socket we are owned by
  7  *    @dev: Device we arrived on/are leaving by
  8  *    @cb: Control buffer. Free for use by every layer. Put private vars here
  9  *    @_skb_refdst: destination entry (with norefcount bit)
 10  *    @sp: the security path, used for xfrm
 11  *    @len: Length of actual data
 12  *    @data_len: Data length
 13  *    @mac_len: Length of link layer header
 14  *    @hdr_len: writable header length of cloned skb
 15  *    @csum: Checksum (must include start/offset pair)
 16  *    @csum_start: Offset from skb->head where checksumming should start
 17  *    @csum_offset: Offset from csum_start where checksum should be stored
 18  *    @priority: Packet queueing priority
 19  *    @local_df: allow local fragmentation
 20  *    @cloned: Head may be cloned (check refcnt to be sure)
 21  *    @ip_summed: Driver fed us an IP checksum
 22  *    @nohdr: Payload reference only, must not modify header
 23  *    @nfctinfo: Relationship of this skb to the connection
 24  *    @pkt_type: Packet class
 25  *    @fclone: skbuff clone status
 26  *    @ipvs_property: skbuff is owned by ipvs
 27  *    @peeked: this packet has been seen already, so stats have been
 28  *        done for it, don't do them again
 29  *    @nf_trace: netfilter packet trace flag
 30  *    @protocol: Packet protocol from driver
 31  *    @destructor: Destruct function
 32  *    @nfct: Associated connection, if any
 33  *    @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 34  *    @skb_iif: ifindex of device we arrived on
 35  *    @tc_index: Traffic control index
 36  *    @tc_verd: traffic control verdict
 37  *    @rxhash: the packet hash computed on receive
 38  *    @queue_mapping: Queue mapping for multiqueue devices
 39  *    @ndisc_nodetype: router type (from link layer)
 40  *    @ooo_okay: allow the mapping of a socket to a queue to be changed
 41  *    @l4_rxhash: indicate rxhash is a canonical 4-tuple hash over transport
 42  *        ports.
 43  *    @wifi_acked_valid: wifi_acked was set
 44  *    @wifi_acked: whether frame was acked on wifi or not
 45  *    @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 46  *    @dma_cookie: a cookie to one of several possible DMA operations
 47  *        done by skb DMA functions
 48   *    @napi_id: id of the NAPI struct this skb came from
 49  *    @secmark: security marking
 50  *    @mark: Generic packet mark
 51  *    @dropcount: total number of sk_receive_queue overflows
 52  *    @vlan_proto: vlan encapsulation protocol
 53  *    @vlan_tci: vlan tag control information
 54  *    @inner_protocol: Protocol (encapsulation)
 55  *    @inner_transport_header: Inner transport layer header (encapsulation)
 56  *    @inner_network_header: Network layer header (encapsulation)
 57  *    @inner_mac_header: Link layer header (encapsulation)
 58  *    @transport_header: Transport layer header
 59  *    @network_header: Network layer header
 60  *    @mac_header: Link layer header
 61  *    @tail: Tail pointer
 62  *    @end: End pointer
 63  *    @head: Head of buffer
 64  *    @data: Data head pointer
 65  *    @truesize: Buffer size
 66  *    @users: User count - see {datagram,tcp}.c
 67  */
 68 
 69 struct sk_buff {
 70     /* These two members must be first. */
 71     struct sk_buff        *next;
 72     struct sk_buff        *prev;
 73 
 74     ktime_t            tstamp;
 75 
 76     struct sock        *sk;
 77     struct net_device    *dev;
 78 
 79     /*
 80      * This is the control buffer. It is free to use for every
 81      * layer. Please put your private variables there. If you
 82      * want to keep them across layers you have to do a skb_clone()
 83      * first. This is owned by whoever has the skb queued ATM.
 84      */
 85     char            cb[48] __aligned(8);    /* 48 bytes, declared with 8-byte alignment */
 86 
 87     unsigned long        _skb_refdst;
 88 #ifdef CONFIG_XFRM
 89     struct    sec_path    *sp;
 90 #endif
 91     unsigned int        len,
 92                 data_len;
 93     __u16            mac_len,
 94                 hdr_len;
 95     union {    /* csum shares storage with the csum_start/csum_offset pair */
 96         __wsum        csum;
 97         struct {
 98             __u16    csum_start;
 99             __u16    csum_offset;
100         };
101     };
102     __u32            priority;
103     kmemcheck_bitfield_begin(flags1);
104     __u8            local_df:1,    /* widths in this declaration: 1 + 1 + 2 + 1 + 3 = 8 bits */
105                 cloned:1,
106                 ip_summed:2,
107                 nohdr:1,
108                 nfctinfo:3;
109     __u8            pkt_type:3,    /* widths in this declaration: 3 + 2 + 1 + 1 + 1 = 8 bits */
110                 fclone:2,
111                 ipvs_property:1,
112                 peeked:1,
113                 nf_trace:1;
114     kmemcheck_bitfield_end(flags1);
115     __be16            protocol;
116 
117     void            (*destructor)(struct sk_buff *skb);
118 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
119     struct nf_conntrack    *nfct;
120 #endif
121 #ifdef CONFIG_BRIDGE_NETFILTER
122     struct nf_bridge_info    *nf_bridge;
123 #endif
124 
125     int            skb_iif;
126 
127     __u32            rxhash;
128 
129     __be16            vlan_proto;
130     __u16            vlan_tci;
131 
132 #ifdef CONFIG_NET_SCHED
133     __u16            tc_index;    /* traffic control index */
134 #ifdef CONFIG_NET_CLS_ACT
135     __u16            tc_verd;    /* traffic control verdict */
136 #endif
137 #endif
138 
139     __u16            queue_mapping;
140     kmemcheck_bitfield_begin(flags2);
141 #ifdef CONFIG_IPV6_NDISC_NODETYPE
142     __u8            ndisc_nodetype:2;
143 #endif
144     __u8            pfmemalloc:1;    /* NOTE(review): this field has no @pfmemalloc entry in the kernel-doc header */
145     __u8            ooo_okay:1;
146     __u8            l4_rxhash:1;
147     __u8            wifi_acked_valid:1;
148     __u8            wifi_acked:1;
149     __u8            no_fcs:1;
150     __u8            head_frag:1;    /* NOTE(review): this field has no @head_frag entry in the kernel-doc header */
151     /* Encapsulation protocol and NIC drivers should use
152      * this flag to indicate to each other if the skb contains
153      * encapsulated packet or not and maybe use the inner packet
154      * headers if needed
155      */
156     __u8            encapsulation:1;
157     /* 6/8 bit hole (depending on ndisc_nodetype presence) */
158     kmemcheck_bitfield_end(flags2);
159 
160 #if defined CONFIG_NET_DMA || defined CONFIG_NET_RX_BUSY_POLL
161     union {    /* napi_id and dma_cookie share storage */
162         unsigned int    napi_id;
163         dma_cookie_t    dma_cookie;
164     };
165 #endif
166 #ifdef CONFIG_NETWORK_SECMARK
167     __u32            secmark;
168 #endif
169     union {    /* mark, dropcount and reserved_tailroom share storage */
170         __u32        mark;
171         __u32        dropcount;
172         __u32        reserved_tailroom;
173     };
174 
175     __be16            inner_protocol;
176     __u16            inner_transport_header;
177     __u16            inner_network_header;
178     __u16            inner_mac_header;
179     __u16            transport_header;
180     __u16            network_header;
181     __u16            mac_header;
182     /* These elements must be at the end, see alloc_skb() for details.  */
183     sk_buff_data_t        tail;
184     sk_buff_data_t        end;
185     unsigned char        *head,
186                 *data;
187     unsigned int        truesize;
188     atomic_t        users;
189 };
View Code

 

posted @ 2016-11-01 15:55  oliver.wang  阅读(1105)  评论(0编辑  收藏  举报