Skip to content

Instantly share code, notes, and snippets.

@pietern
Created August 17, 2016 02:48
Show Gist options
  • Select an option

  • Save pietern/21c6765849be89f862d93050df7acded to your computer and use it in GitHub Desktop.

Select an option

Save pietern/21c6765849be89f862d93050df7acded to your computer and use it in GitHub Desktop.

Revisions

  1. pietern created this gist Aug 17, 2016.
    309 changes: 309 additions & 0 deletions ompi-ipv6-link-local.patch
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,309 @@
    commit 8d1f0719776e92837ef3ab3f2b895f057e4a9c36
    Author: Pieter Noordhuis <[email protected]>
    Date: Tue Aug 16 19:31:14 2016 -0700

    Support IPv6 link-local addresses

    Link-local addresses all share a single prefix (fe80::/10), so the
    routing table cannot tell the kernel which interface to transmit on.
    The Linux IPv6 implementation therefore requires applications to
    populate the `sin6_scope_id` field of the `sockaddr_in6` struct with
    the index of the interface to communicate on.

    diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h
    index 5279c09..ae2c5c6 100644
    --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h
    +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h
    @@ -78,8 +78,10 @@ struct mca_btl_tcp_component_t {
    opal_event_t tcp6_recv_event; /**< recv event for IPv6 listen socket */
    int tcp6_listen_sd; /**< IPv6 listen socket for incoming connection requests */
    unsigned short tcp6_listen_port; /**< IPv6 listen port */
    - int tcp6_port_min; /**< IPv4 minimum port */
    - int tcp6_port_range; /**< IPv4 port range */
    + int tcp6_port_min; /**< IPv6 minimum port */
    + int tcp6_port_range; /**< IPv6 port range */
    + bool tcp6_use_link_local; /**< Enable use of IPv6 link-local addresses */
    + int tcp6_link_local_scope_id; /**< Kernel index of interface for link-local traffic */
    #endif
    /* Port range restriction */

    diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c
    index 59a3a48..8bbbb96 100644
    --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c
    +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c
    @@ -160,6 +160,20 @@ static inline unsigned int mca_btl_tcp_param_register_uint(
    return *storage;
    }

    +/* Register the boolean MCA variable `param_name`, backed by *storage, which
    + * is first set to default_value. The registration result is ignored;
    + * returns the value held in *storage after registration was attempted. */
    +static inline bool mca_btl_tcp_param_register_bool(
    +    const char* param_name, const char* help_string,
    +    bool default_value, int level, bool *storage)
    +{
    +    *storage = default_value;
    +    (void) mca_base_component_var_register(&mca_btl_tcp_component.super.btl_version,
    +                                           param_name, help_string, MCA_BASE_VAR_TYPE_BOOL,
    +                                           NULL, 0, 0, level,
    +                                           MCA_BASE_VAR_SCOPE_READONLY, storage);
    +    return *storage;
    +}

    /*
    * Data structure for accepting connections.
    @@ -254,6 +268,10 @@ static int mca_btl_tcp_component_register(void)
    (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1,
    OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_port_range );
    free(message);
    +
    + mca_btl_tcp_param_register_bool("enable_ipv6_link_local",
    + "Whether to enable use of IPv6 link-local addresses (default: false)", false,
    + OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_use_link_local);
    #endif

    mca_btl_tcp_component.report_all_unfound_interfaces = false;
    @@ -677,7 +695,7 @@ static int mca_btl_tcp_component_create_instances(void)
    and therefore we're done. */
    if (mca_btl_tcp_component.tcp_num_btls > 0) {
    ret = OMPI_SUCCESS;
    - goto cleanup;
    + goto check;
    }

    /* if the interface list was not specified by the user, create
    @@ -709,7 +727,91 @@ static int mca_btl_tcp_component_create_instances(void)
    }
    opal_argv_free(exclude);

    - cleanup:
    + check:
    +#if OPAL_ENABLE_IPV6
    + /* If using IPv6 link-local addresses is OK, we need to verify
    + * we have only a single BTL instance with a link-local address,
    + * or they will be ambiguous. */
    + if (!mca_btl_tcp_component.tcp6_use_link_local) {
    + goto cleanup;
    + }
    +
    + {
    + int link_local_ifkindex = -1;
    + unsigned int btl_index;
    +
    + for (btl_index = 0;
    + btl_index < mca_btl_tcp_component.tcp_num_btls;
    + btl_index++) {
    + for (if_index = opal_ifbegin();
    + if_index >= 0;
    + if_index = opal_ifnext(if_index)) {
    + /* IF_NAMESIZE is defined in opal/util/if.h */
    + char if_name[IF_NAMESIZE];
    + struct sockaddr_in6 ss;
    +
    + if (opal_ifindextokindex(if_index) !=
    + mca_btl_tcp_component.tcp_btls[btl_index]->tcp_ifkindex) {
    + continue;
    + }
    +
    + ret = opal_ifindextoaddr(if_index,
    + (struct sockaddr*) &ss,
    + sizeof(ss));
    + if (ret != OPAL_SUCCESS) {
    + opal_output (0,
    + "btl_tcp_component: "
    + "unable to get address for "
    + "index %i (kernel index %i)",
    + if_index,
    + opal_ifindextokindex(if_index));
    + goto cleanup;
    + }
    +
    + /* Ignore non-IPv6 addresses */
    + if (ss.sin6_family != AF_INET6) {
    + continue;
    + }
    +
    + /* Ignore addresses other than link-local */
    + if (ss.sin6_scope_id != 0x20) {
    + continue;
    + }
    +
    + /*
    + * Error if there are multiple interfaces with
    + * a link-local address (they will be ambiguous).
    + */
    + if (link_local_ifkindex >= 0) {
    + opal_output (0,
    + "btl_tcp_component: "
    + "multiple link-local addresses found");
    + ret = OMPI_ERROR;
    + goto cleanup;
    + }
    +
    + link_local_ifkindex = opal_ifindextokindex(if_index);
    + opal_ifindextoname(if_index, if_name, sizeof(if_name));
    + opal_output (0,
    + "btl_tcp_component: "
    + "using %s for link-local traffic",
    + if_name);
    + }
    + }
    +
    + if (link_local_ifkindex < 0) {
    + opal_output (0,
    + "btl_tcp_component: "
    + "no link-local addresses found");
    + ret = OMPI_ERROR;
    + goto cleanup;
    + }
    +
    + mca_btl_tcp_component.tcp6_link_local_scope_id = link_local_ifkindex;
    + }
    +#endif
    +
    +cleanup:
    if (NULL != kindexes) {
    free(kindexes);
    }
    diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c
    index 89aee88..21e4e50 100644
    --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c
    +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c
    @@ -798,6 +798,18 @@ bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t* proc_addr,
    inaddr->sin6_port = proc_addr->addr_port;
    inaddr->sin6_scope_id = 0;
    inaddr->sin6_flowinfo = 0;
    +
    + /*
    + * If this is a link-local address AND the component is configured
    + * to allow link-local addresses for BTL traffic, set the
    + * scope_id so the kernel passes it to the right network interface.
    + */
    + if ((inaddr->sin6_addr.s6_addr[0] & 0xff) == 0xfe &&
    + (inaddr->sin6_addr.s6_addr[1] & 0xc0) == 0x80 &&
    + mca_btl_tcp_component.tcp6_use_link_local) {
    + inaddr->sin6_scope_id =
    + mca_btl_tcp_component.tcp6_link_local_scope_id;
    + }
    }
    break;
    #endif
    diff --git a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
    index 2832371..ffa0b8e 100644
    --- a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
    +++ b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c
    @@ -118,8 +118,8 @@ static int if_linux_ipv6_open(void)
    addrbyte[8], addrbyte[9], addrbyte[10], addrbyte[11],
    addrbyte[12], addrbyte[13], addrbyte[14], addrbyte[15], scope);

    - /* Only interested in global (0x00) scope */
    - if (scope != 0x00) {
    + /* Only interested in global (0x00) and link-local (0x20) scope */
    + if (scope != 0x00 && scope != 0x20) {
    opal_output_verbose(1, opal_if_base_framework.framework_output,
    "skipping interface %2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x scope %x\n",
    addrbyte[0], addrbyte[1], addrbyte[2], addrbyte[3],
    diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c
    index 498a42d..5b453ca 100644
    --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c
    +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c
    @@ -438,6 +438,17 @@ static int tcp_component_register(void)
    OPAL_INFO_LVL_6,
    MCA_BASE_VAR_SCOPE_READONLY,
    &mca_oob_tcp_component.skip_version_check);
    +
    +#if OPAL_ENABLE_IPV6
    + mca_oob_tcp_component.tcp6_use_link_local = false;
    + mca_oob_tcp_component.tcp6_link_local_scope_id = -1;
    + (void)mca_base_component_var_register(component, "enable_ipv6_link_local",
    + "Whether to enable use of IPv6 link-local addresses (default: false)",
    + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0,
    + OPAL_INFO_LVL_2,
    + MCA_BASE_VAR_SCOPE_READONLY,
    + &mca_oob_tcp_component.tcp6_use_link_local);
    +#endif
    return ORTE_SUCCESS;
    }

    @@ -566,6 +577,12 @@ static bool component_available(void)
    opal_argv_append_nosize(&mca_oob_tcp_component.ipv4conns, opal_net_get_hostname((struct sockaddr*) &my_ss));
    } else if (AF_INET6 == my_ss.ss_family) {
    #if OPAL_ENABLE_IPV6
    + /* If this address has link local scope, capture its interface index */
    + if (mca_oob_tcp_component.tcp6_use_link_local &&
    + ((struct sockaddr_in6*) &my_ss)->sin6_scope_id == 0x20) {
    + mca_oob_tcp_component.tcp6_link_local_scope_id = kindex;
    + }
    +
    opal_output_verbose(10, orte_oob_base_framework.framework_output,
    "%s oob:tcp:init adding %s to our list of %s connections",
    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
    diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h
    index d8d47a2..03182b4 100644
    --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h
    +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h
    @@ -62,9 +62,11 @@ typedef struct {

    #if OPAL_ENABLE_IPV6
    /* IPv6 support */
    - bool disable_ipv6_family; /**< disable this AF */
    - char** tcp6_static_ports; /**< Static ports - IPV6 */
    - char** tcp6_dyn_ports; /**< Dynamic ports - IPV6 */
    + bool disable_ipv6_family; /**< disable this AF */
    + char** tcp6_static_ports; /**< Static ports - IPV6 */
    + char** tcp6_dyn_ports; /**< Dynamic ports - IPV6 */
    + bool tcp6_use_link_local; /**< Enable use of IPv6 link-local addresses */
    + int tcp6_link_local_scope_id; /**< Kernel index of interface for link-local traffic */
    char** ipv6conns;
    char** ipv6ports;
    #endif
    diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
    index 124dc9d..71565ee 100644
    --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
    +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c
    @@ -144,6 +144,19 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
    continue;
    }

    +#if OPAL_ENABLE_IPV6
    + if (addr->addr.ss_family == AF_INET6) {
    + struct sockaddr_in6* inaddr = (struct sockaddr_in6*) &addr->addr;
    +
    + if ((inaddr->sin6_addr.s6_addr[0] & 0xff) == 0xfe &&
    + (inaddr->sin6_addr.s6_addr[1] & 0xc0) == 0x80 &&
    + mca_oob_tcp_component.tcp6_use_link_local) {
    + inaddr->sin6_scope_id =
    + mca_oob_tcp_component.tcp6_link_local_scope_id;
    + }
    + }
    +#endif
    +
    addrlen = addr->addr.ss_family == AF_INET6
    ? sizeof(struct sockaddr_in6)
    : sizeof(struct sockaddr_in);
    @@ -172,6 +185,25 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata)
    CLOSE_THE_SOCKET(peer->sd);
    continue;
    }
    + /* When testing use of IPv6 link-local addresses, the 4.0
    + * kernel would immediately return EADDRNOTAVAIL when
    + * connecting to a link-local address on the same host.
    + * This appears to be a transient problem that only
    + * manifests for a short period of time after calling
    + * listen(2) on the server side of the socket. Therefore,
    + * inserting a small delay on the client side fixes the
    + * problem. Since establishing these connections only
    + * happens at initialization time, a delay is acceptable.
    + */
    + if (EADDRNOTAVAIL == opal_socket_errno) {
    + opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
    + "%s connection to %s returned EADDRNOTAVAIL - retrying after delay",
    + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
    + ORTE_NAME_PRINT(&peer->name));
    + CLOSE_THE_SOCKET(peer->sd);
    + sleep(1);
    + continue;
    + }
    if (rc < 0) {
    opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output,
    "%s connection to %s returned %d (%d, %s)",