qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [RFC PATCH V5 2/4] colo-compare: track connection and e


From: Jason Wang
Subject: Re: [Qemu-devel] [RFC PATCH V5 2/4] colo-compare: track connection and enqueue packet
Date: Mon, 11 Jul 2016 13:41:09 +0800
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.8.0



On 2016年07月08日 17:56, Zhang Chen wrote:


On 07/08/2016 12:07 PM, Jason Wang wrote:


On 2016年06月23日 19:34, Zhang Chen wrote:
In this patch we use kernel jhash table to track
connection, and then enqueue net packet like this:

+ CompareState ++
|               |
+---------------+   +---------------+ +---------------+
|conn list      +--->conn +--------->conn           |
+---------------+   +---------------+ +---------------+
|               |     |           |             |          |
+---------------+ +---v----+  +---v----+    +---v----+ +---v----+
                   |primary |  |secondary    |primary | |secondary
                   |packet  |  |packet  +    |packet  | |packet +
                   +--------+  +--------+    +--------+ +--------+
                       |           |             |          |
                   +---v----+  +---v----+    +---v----+ +---v----+
                   |primary |  |secondary    |primary | |secondary
                   |packet  |  |packet  +    |packet  | |packet +
                   +--------+  +--------+    +--------+ +--------+
                       |           |             |          |
                   +---v----+  +---v----+    +---v----+ +---v----+
                   |primary |  |secondary    |primary | |secondary
                   |packet  |  |packet  +    |packet  | |packet +
                   +--------+  +--------+    +--------+ +--------+

A paragraph to describe the above would be more than welcomed.

I will add some comments for it.


Signed-off-by: Zhang Chen <address@hidden>
Signed-off-by: Li Zhijian <address@hidden>
Signed-off-by: Wen Congyang <address@hidden>
---
  include/qemu/jhash.h |  61 ++++++++++++++++
  net/Makefile.objs    |   1 +
net/colo-base.c | 194 +++++++++++++++++++++++++++++++++++++++++++++++++++
  net/colo-base.h      |  88 +++++++++++++++++++++++
  net/colo-compare.c   | 138 +++++++++++++++++++++++++++++++++++-
  trace-events         |   3 +
  6 files changed, 483 insertions(+), 2 deletions(-)
  create mode 100644 include/qemu/jhash.h
  create mode 100644 net/colo-base.c
  create mode 100644 net/colo-base.h

diff --git a/include/qemu/jhash.h b/include/qemu/jhash.h
new file mode 100644
index 0000000..0fcd875
--- /dev/null
+++ b/include/qemu/jhash.h
@@ -0,0 +1,61 @@
+/* jhash.h: Jenkins hash support.
+  *
+  * Copyright (C) 2006. Bob Jenkins (address@hidden)
+  *
+  * http://burtleburtle.net/bob/hash/
+  *
+  * These are the credits from Bob's sources:
+  *
+  * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+  *
+ * These are functions for producing 32-bit hashes for hash table lookup. + * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+  * are externally useful functions.  Routines to test the hash are
+included
+  * if SELF_TEST is defined.  You can use this free for any purpose.
+It's in
+  * the public domain.  It has no warranty.
+  *
+ * Copyright (C) 2009-2010 Jozsef Kadlecsik (address@hidden)
+  *
+  * I've modified Bob's hash to be useful in the Linux kernel, and
+  * any bugs present are my fault.
+  * Jozsef
+  */
+
+#ifndef QEMU_JHASH_H__
+#define QEMU_JHASH_H__
+
+#include "qemu/bitops.h"
+
+/*
+ * hashtable relation copy from linux kernel jhash
+ */
+
+/* __jhash_mix -- mix 3 32-bit values reversibly. */
+#define __jhash_mix(a, b, c)                \
+{                                           \
+    a -= c;  a ^= rol32(c, 4);  c += b;     \
+    b -= a;  b ^= rol32(a, 6);  a += c;     \
+    c -= b;  c ^= rol32(b, 8);  b += a;     \
+    a -= c;  a ^= rol32(c, 16); c += b;     \
+    b -= a;  b ^= rol32(a, 19); a += c;     \
+    c -= b;  c ^= rol32(b, 4);  b += a;     \
+}
+
+/* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
+#define __jhash_final(a, b, c)  \
+{                               \
+    c ^= b; c -= rol32(b, 14);  \
+    a ^= c; a -= rol32(c, 11);  \
+    b ^= a; b -= rol32(a, 25);  \
+    c ^= b; c -= rol32(b, 16);  \
+    a ^= c; a -= rol32(c, 4);   \
+    b ^= a; b -= rol32(a, 14);  \
+    c ^= b; c -= rol32(b, 24);  \
+}
+
+/* An arbitrary initial parameter */
+#define JHASH_INITVAL           0xdeadbeef
+
+#endif /* QEMU_JHASH_H__ */

Please split jhash into another patch.

Split to a independent patch in this patch set or not?

Better this series since it was the first user.




diff --git a/net/Makefile.objs b/net/Makefile.objs
index ba92f73..119589f 100644
--- a/net/Makefile.objs
+++ b/net/Makefile.objs
@@ -17,3 +17,4 @@ common-obj-y += filter.o
  common-obj-y += filter-buffer.o
  common-obj-y += filter-mirror.o
  common-obj-y += colo-compare.o
+common-obj-y += colo-base.o
diff --git a/net/colo-base.c b/net/colo-base.c
new file mode 100644
index 0000000..7e263e8
--- /dev/null
+++ b/net/colo-base.c
@@ -0,0 +1,194 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "net/colo-base.h"
+
+uint32_t connection_key_hash(const void *opaque)
+{
+    const ConnectionKey *key = opaque;
+    uint32_t a, b, c;
+
+    /* Jenkins hash */
+    a = b = c = JHASH_INITVAL + sizeof(*key);
+    a += key->src.s_addr;
+    b += key->dst.s_addr;
+    c += (key->src_port | key->dst_port << 16);
+    __jhash_mix(a, b, c);
+
+    a += key->ip_proto;
+    __jhash_final(a, b, c);
+
+    return c;
+}
+
+int connection_key_equal(const void *key1, const void *key2)
+{
+    return memcmp(key1, key2, sizeof(ConnectionKey)) == 0;
+}
+
+int parse_packet_early(Packet *pkt)
+{
+    int network_length;
+    uint8_t *data = pkt->data;
+    uint16_t l3_proto;
+    ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
+
+    if (pkt->size < ETH_HLEN) {
+        error_report("pkt->size < ETH_HLEN");
+        return 1;
+    }
+    pkt->network_layer = data + ETH_HLEN;
+    l3_proto = eth_get_l3_proto(data, l2hdr_len);
+    if (l3_proto != ETH_P_IP) {
+        return 1;
+    }
+
+    network_length = pkt->ip->ip_hl * 4;
+    if (pkt->size < ETH_HLEN + network_length) {
+        error_report("pkt->size < network_layer + network_length");
+        return 1;
+    }
+    pkt->transport_layer = pkt->network_layer + network_length;
+    if (!pkt->transport_layer) {
+        error_report("pkt->transport_layer is valid");
+        return 1;
+    }
+
+    return 0;
+}
+
+void fill_connection_key(Packet *pkt, ConnectionKey *key, int mode)
+{
+    uint32_t tmp_ports;
+
+    key->ip_proto = pkt->ip->ip_p;
+
+    switch (key->ip_proto) {
+    case IPPROTO_TCP:
+    case IPPROTO_UDP:
+    case IPPROTO_DCCP:
+    case IPPROTO_ESP:
+    case IPPROTO_SCTP:
+    case IPPROTO_UDPLITE:
+        tmp_ports = *(uint32_t *)(pkt->transport_layer);
+        if (mode) {

Looks like mode is unnecessary here, you can actually compare and swap duing hashing to avoid mode here.

I get your point.


+            key->src = pkt->ip->ip_src;
+            key->dst = pkt->ip->ip_dst;
+            key->src_port = ntohs(tmp_ports & 0xffff);
+            key->dst_port = ntohs(tmp_ports >> 16);
+        } else {
+            key->dst = pkt->ip->ip_src;
+            key->src = pkt->ip->ip_dst;
+            key->dst_port = ntohs(tmp_ports & 0xffff);
+            key->src_port = ntohs(tmp_ports >> 16);
+        }
+        break;
+    case IPPROTO_AH:
+        tmp_ports = *(uint32_t *)(pkt->transport_layer + 4);
+        if (mode) {
+            key->src = pkt->ip->ip_src;
+            key->dst = pkt->ip->ip_dst;
+            key->src_port = ntohs(tmp_ports & 0xffff);
+            key->dst_port = ntohs(tmp_ports >> 16);
+        } else {
+            key->dst = pkt->ip->ip_src;
+            key->src = pkt->ip->ip_dst;
+            key->dst_port = ntohs(tmp_ports & 0xffff);
+            key->src_port = ntohs(tmp_ports >> 16);
+        }
+        break;
+    default:
+        key->src_port = 0;
+        key->dst_port = 0;
+        break;
+    }
+}

This seems could be reused, please use a independent patch for connection key stuffs.

In this patch set or not?
If not, we make a new .c and .h for this?


Yes, this series please.


+
+Connection *connection_new(ConnectionKey *key)
+{
+    Connection *conn = g_slice_new(Connection);
+
+    conn->ip_proto = key->ip_proto;
+    conn->processing = false;
+    g_queue_init(&conn->primary_list);
+    g_queue_init(&conn->secondary_list);
+
+    return conn;
+}
+
+void connection_destroy(void *opaque)
+{
+    Connection *conn = opaque;
+
+    g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+    g_queue_free(&conn->primary_list);
+    g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+    g_queue_free(&conn->secondary_list);
+    g_slice_free(Connection, conn);
+}
+
+Packet *packet_new(const void *data, int size)
+{
+    Packet *pkt = g_slice_new(Packet);
+
+    pkt->data = g_memdup(data, size);
+    pkt->size = size;
+
+    return pkt;
+}
+
+void packet_destroy(void *opaque, void *user_data)
+{
+    Packet *pkt = opaque;
+
+    g_free(pkt->data);
+    g_slice_free(Packet, pkt);
+}
+
+/*
+ * Clear hashtable, stop this hash growing really huge
+ */
+void connection_hashtable_reset(GHashTable *connection_track_table)
+{
+    g_hash_table_remove_all(connection_track_table);
+}
+
+/* if not found, create a new connection and add to hash table */
+Connection *connection_get(GHashTable *connection_track_table,
+                           ConnectionKey *key,
+                           uint32_t *hashtable_size)
+{
+    /* FIXME: protect connection_track_table */

I fail to understand why need protection here.

No need this...will remove it.


+ Connection *conn = g_hash_table_lookup(connection_track_table, key);
+
+    if (conn == NULL) {
+        ConnectionKey *new_key = g_memdup(key, sizeof(*key));
+
+        conn = connection_new(key);
+
+        (*hashtable_size) += 1;
+        if (*hashtable_size > HASHTABLE_MAX_SIZE) {
+ error_report("colo proxy connection hashtable full, clear it");

Is this a hint that we need a synchronization?

NO...we needn't.


But you reset the hash table which means we lose the status of packet comparing?


+ connection_hashtable_reset(connection_track_table);
+            *hashtable_size = 0;
+            /* TODO:clear conn_list */

If we don't clear conn_list, looks like a bug, so probably need to do this in this patch.

OK~~


+        }
+
+        g_hash_table_insert(connection_track_table, new_key, conn);
+    }
+
+    return conn;
+}
diff --git a/net/colo-base.h b/net/colo-base.h
new file mode 100644
index 0000000..01c1a5d
--- /dev/null
+++ b/net/colo-base.h
@@ -0,0 +1,88 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <address@hidden>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_BASE_H
+#define QEMU_COLO_BASE_H
+
+#include "slirp/slirp.h"
+#include "qemu/jhash.h"
+#include "qemu/rcu.h"

Don't see any rcu usage in this patch.

will remove it.


+
+#define HASHTABLE_MAX_SIZE 16384
+
+typedef enum colo_conn_state {

This looks like can only take care of TCP, so probably add "tcp" in its name.

yes.


+     COLO_CONN_IDLE,
+
+    /* States on the primary: For incoming connection */
+     COLO_CONN_PRI_IN_SYN,   /* Received Syn */
+ COLO_CONN_PRI_IN_PSYNACK, /* Received syn/ack from primary, but not
+                                yet from secondary */
+     COLO_CONN_PRI_IN_SSYNACK, /* Received syn/ack from secondary, but
+                                  not yet from primary */
+     COLO_CONN_PRI_IN_SYNACK,  /* Received syn/ack from both */
+     COLO_CONN_PRI_IN_ESTABLISHED, /* Got the ACK */
+
+    /* States on the secondary: For incoming connection */
+     COLO_CONN_SEC_IN_SYNACK,      /* We sent a syn/ack */
+ COLO_CONN_SEC_IN_ACK, /* Saw the ack but didn't yet see our syn/ack */
+     COLO_CONN_SEC_IN_ESTABLISHED, /* Got the ACK from the outside */

Should we care about any FIN state here?

Currently we don't care.


Then a comment to explain why only care the stated during connection establishment will be better.


+} colo_conn_state;
+
+typedef struct Packet {
+    void *data;
+    union {
+        uint8_t *network_layer;
+        struct ip *ip;
+    };
+    uint8_t *transport_layer;
+    int size;
+} Packet;

We may start to consider shares codes between e.g hw/net/net_tx_pkt.c.

I read it.the file be added to qemu a mouth ago.
it need time to be stable.maybe it will change.
So I think this job should be do after colo-compare be merged...

Ok, but we need to avoid duplications as much as possible.



+
+typedef struct ConnectionKey {
+ /* (src, dst) must be grouped, in the same way than in IP header */
+    struct in_addr src;
+    struct in_addr dst;
+    uint16_t src_port;
+    uint16_t dst_port;
+    uint8_t ip_proto;
+} QEMU_PACKED ConnectionKey;
+
+typedef struct Connection {
+    /* connection primary send queue: element type: Packet */
+    GQueue primary_list;
+    /* connection secondary send queue: element type: Packet */
+    GQueue secondary_list;
+    /* flag to enqueue unprocessed_connections */
+    bool processing;
+    uint8_t ip_proto;
+    /* be used by filter-rewriter */
+    colo_conn_state state;
+    tcp_seq  primary_seq;
+    tcp_seq  secondary_seq;
+} Connection;
+
+uint32_t connection_key_hash(const void *opaque);
+int connection_key_equal(const void *opaque1, const void *opaque2);
+int parse_packet_early(Packet *pkt);
+void fill_connection_key(Packet *pkt, ConnectionKey *key, int mode);
+Connection *connection_new(ConnectionKey *key);
+void connection_destroy(void *opaque);
+Connection *connection_get(GHashTable *connection_track_table,
+                           ConnectionKey *key,
+                           uint32_t *hashtable_size);
+void connection_hashtable_reset(GHashTable *connection_track_table);
+Packet *packet_new(const void *data, int size);
+void packet_destroy(void *opaque, void *user_data);
+
+#endif /* QEMU_COLO_BASE_H */
diff --git a/net/colo-compare.c b/net/colo-compare.c
index a3e1456..4231fe7 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -28,6 +28,7 @@
  #include "qemu/sockets.h"
  #include "qapi-visit.h"
  #include "trace.h"
+#include "net/colo-base.h"
    #define TYPE_COLO_COMPARE "colo-compare"
  #define COLO_COMPARE(obj) \
@@ -38,6 +39,28 @@
  static QTAILQ_HEAD(, CompareState) net_compares =
         QTAILQ_HEAD_INITIALIZER(net_compares);
  +/*
+  + CompareState ++
+  |               |
+  +---------------+   +---------------+ +---------------+
+  |conn list      +--->conn +--------->conn           |
+  +---------------+   +---------------+ +---------------+
+  |               |     |           |             | |
+  +---------------+ +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             | |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+                        |           |             | |
+                    +---v----+  +---v----+    +---v----+ +---v----+
+                    |primary |  |secondary    |primary | |secondary
+                    |packet  |  |packet  +    |packet  | |packet  +
+                    +--------+  +--------+    +--------+ +--------+
+*/
  typedef struct CompareState {
      Object parent;
  @@ -50,12 +73,103 @@ typedef struct CompareState {
      QTAILQ_ENTRY(CompareState) next;
      SocketReadState pri_rs;
      SocketReadState sec_rs;
+
+ /* connection list: the connections belonged to this NIC could be found
+     * in this list.
+     * element type: Connection
+     */
+    GQueue conn_list;
+    QemuMutex conn_list_lock; /* to protect conn_list */

Why need this mutex?

will remove it.


+    /* hashtable to save connection */
+    GHashTable *connection_track_table;
+    /* to save unprocessed_connections */
+    GQueue unprocessed_connections;
+    /* proxy current hash size */
+    uint32_t hashtable_size;
  } CompareState;
    typedef struct CompareClass {
      ObjectClass parent_class;
  } CompareClass;
  +enum {
+    PRIMARY_IN = 0,
+    SECONDARY_IN,
+};
+
+static int compare_chr_send(CharDriverState *out,
+                            const uint8_t *buf,
+                            uint32_t size);
+
+/*
+ * Return 0 on success, if return -1 means the pkt
+ * is unsupported(arp and ipv6) and will be sent later
+ */
+static int packet_enqueue(CompareState *s, int mode)
+{
+    ConnectionKey key = {{ 0 } };
+    Packet *pkt = NULL;
+    Connection *conn;
+
+    if (mode == PRIMARY_IN) {
+        pkt = packet_new(s->pri_rs.buf, s->pri_rs.packet_len);
+    } else {
+        pkt = packet_new(s->sec_rs.buf, s->sec_rs.packet_len);
+    }
+
+    if (parse_packet_early(pkt)) {
+        packet_destroy(pkt, NULL);
+        pkt = NULL;
+        return -1;
+    }
+    fill_connection_key(pkt, &key, PRIMARY_IN);
+
+    conn = connection_get(s->connection_track_table,
+                          &key,
+                          &s->hashtable_size);
+    if (!conn->processing) {
+        qemu_mutex_lock(&s->conn_list_lock);
+        g_queue_push_tail(&s->conn_list, conn);
+        qemu_mutex_unlock(&s->conn_list_lock);
+        conn->processing = true;
+    }
+
+    if (mode == PRIMARY_IN) {
+        g_queue_push_tail(&conn->primary_list, pkt);
+    } else {
+        g_queue_push_tail(&conn->secondary_list, pkt);
+    }
+
+    return 0;
+}
+
+static int compare_chr_send(CharDriverState *out,
+                            const uint8_t *buf,
+                            uint32_t size)
+{
+    int ret = 0;
+    uint32_t len = htonl(size);
+
+    if (!size) {
+        return 0;
+    }
+
+    ret = qemu_chr_fe_write_all(out, (uint8_t *)&len, sizeof(len));
+    if (ret != sizeof(len)) {
+        goto err;
+    }
+
+    ret = qemu_chr_fe_write_all(out, (uint8_t *)buf, size);
+    if (ret != size) {
+        goto err;
+    }
+
+    return 0;
+
+err:
+    return ret < 0 ? ret : -EIO;
+}
+
  static char *compare_get_pri_indev(Object *obj, Error **errp)
  {
      CompareState *s = COLO_COMPARE(obj);
@@ -103,12 +217,21 @@ static void compare_set_outdev(Object *obj, const char *value, Error **errp)
    static void compare_pri_rs_finalize(SocketReadState *pri_rs)
  {
- /* if packet_enqueue pri pkt failed we will send unsupported packet */
+    CompareState *s = container_of(pri_rs, CompareState, pri_rs);
+
+    if (packet_enqueue(s, PRIMARY_IN)) {
+        trace_colo_compare_main("primary: unsupported packet in");
+        compare_chr_send(s->chr_out, pri_rs->buf, pri_rs->packet_len);
+    }

Do we have a upper limit on the maximum numbers of packets could be queued? If not, guest may easily trigger OOM.

We need a g_queue to do this job?

Maybe.

It upper than the limit we drop the packet?

Thanks
Zhang Chen

Needs more thought, but we could start from dropping packets.



  }
    static void compare_sec_rs_finalize(SocketReadState *sec_rs)
  {
-    /* if packet_enqueue sec pkt failed we will notify trace */
+    CompareState *s = container_of(sec_rs, CompareState, sec_rs);
+
+    if (packet_enqueue(s, SECONDARY_IN)) {
+        trace_colo_compare_main("secondary: unsupported packet in");
+    }
  }
    /*
@@ -161,6 +284,15 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp)
      net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize);
      net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize);
  +    g_queue_init(&s->conn_list);
+    qemu_mutex_init(&s->conn_list_lock);
+    s->hashtable_size = 0;
+
+ s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+ connection_key_equal,
+                                                      g_free,
+ connection_destroy);
+
      return;
  }
  @@ -203,6 +335,8 @@ static void colo_compare_finalize(Object *obj)
      if (!QTAILQ_EMPTY(&net_compares)) {
          QTAILQ_REMOVE(&net_compares, s, next);
      }
+    qemu_mutex_destroy(&s->conn_list_lock);
+    g_queue_free(&s->conn_list);
        g_free(s->pri_indev);
      g_free(s->sec_indev);
diff --git a/trace-events b/trace-events
index ca7211b..703de1a 100644
--- a/trace-events
+++ b/trace-events
@@ -1916,3 +1916,6 @@ aspeed_vic_update_fiq(int flags) "Raising FIQ: %d"
  aspeed_vic_update_irq(int flags) "Raising IRQ: %d"
aspeed_vic_read(uint64_t offset, unsigned size, uint32_t value) "From 0x%" PRIx64 " of size %u: 0x%" PRIx32 aspeed_vic_write(uint64_t offset, unsigned size, uint32_t data) "To 0x%" PRIx64 " of size %u: 0x%" PRIx32
+
+# net/colo-compare.c
+colo_compare_main(const char *chr) ": %s"



.






reply via email to

[Prev in Thread] Current Thread [Next in Thread]