commit 8d1f0719776e92837ef3ab3f2b895f057e4a9c36 Author: Pieter Noordhuis Date: Tue Aug 16 19:31:14 2016 -0700 Support IPv6 link-local addresses Link-local addresses use a single prefix (fe80::/10) so a routing table doesn't help figuring out which interface to transmit it on. The Linux IPv6 implementation asks application developers to populate the `sin6_scope_id` field on the `sockaddr_in6` struct with the index of the interface to communicate on. diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h index 5279c09..ae2c5c6 100644 --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp.h @@ -78,8 +78,10 @@ struct mca_btl_tcp_component_t { opal_event_t tcp6_recv_event; /**< recv event for IPv6 listen socket */ int tcp6_listen_sd; /**< IPv6 listen socket for incoming connection requests */ unsigned short tcp6_listen_port; /**< IPv6 listen port */ - int tcp6_port_min; /**< IPv4 minimum port */ - int tcp6_port_range; /**< IPv4 port range */ + int tcp6_port_min; /**< IPv6 minimum port */ + int tcp6_port_range; /**< IPv6 port range */ + bool tcp6_use_link_local; /**< Enable use of IPv6 link-local addresses */ + int tcp6_link_local_scope_id; /**< Kernel index of interface for link-local traffic */ #endif /* Port range restriction */ diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c index 59a3a48..8bbbb96 100644 --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_component.c @@ -160,6 +160,20 @@ static inline unsigned int mca_btl_tcp_param_register_uint( return *storage; } +static inline unsigned int mca_btl_tcp_param_register_bool( + const char* param_name, + const char* help_string, + bool default_value, + int level, + bool *storage) +{ + *storage = default_value; + (void) 
mca_base_component_var_register(&mca_btl_tcp_component.super.btl_version, + param_name, help_string, MCA_BASE_VAR_TYPE_BOOL, + NULL, 0, 0, level, + MCA_BASE_VAR_SCOPE_READONLY, storage); + return *storage; +} /* * Data structure for accepting connections. @@ -254,6 +268,10 @@ static int mca_btl_tcp_component_register(void) (0x1 << 16) - mca_btl_tcp_component.tcp6_port_min - 1, OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_port_range ); free(message); + + mca_btl_tcp_param_register_bool("enable_ipv6_link_local", + "Whether to enable use of IPv6 link-local addresses (default: false)", false, + OPAL_INFO_LVL_2, &mca_btl_tcp_component.tcp6_use_link_local); #endif mca_btl_tcp_component.report_all_unfound_interfaces = false; @@ -677,7 +695,7 @@ static int mca_btl_tcp_component_create_instances(void) and therefore we're done. */ if (mca_btl_tcp_component.tcp_num_btls > 0) { ret = OMPI_SUCCESS; - goto cleanup; + goto check; } /* if the interface list was not specified by the user, create @@ -709,7 +727,91 @@ static int mca_btl_tcp_component_create_instances(void) } opal_argv_free(exclude); - cleanup: + check: +#if OPAL_ENABLE_IPV6 + /* If using IPv6 link-local addresses is OK, we need to verify + * we have only a single BTL instance with a link-local address, + * or they will be ambiguous. 
*/ + if (!mca_btl_tcp_component.tcp6_use_link_local) { + goto cleanup; + } + + { + int link_local_ifkindex = -1; + unsigned int btl_index; + + for (btl_index = 0; + btl_index < mca_btl_tcp_component.tcp_num_btls; + btl_index++) { + for (if_index = opal_ifbegin(); + if_index >= 0; + if_index = opal_ifnext(if_index)) { + /* IF_NAMESIZE is defined in opal/util/if.h */ + char if_name[IF_NAMESIZE]; + struct sockaddr_in6 ss; + + if (opal_ifindextokindex(if_index) != + mca_btl_tcp_component.tcp_btls[btl_index]->tcp_ifkindex) { + continue; + } + + ret = opal_ifindextoaddr(if_index, + (struct sockaddr*) &ss, + sizeof(ss)); + if (ret != OPAL_SUCCESS) { + opal_output (0, + "btl_tcp_component: " + "unable to get address for " + "index %i (kernel index %i)", + if_index, + opal_ifindextokindex(if_index)); + goto cleanup; + } + + /* Ignore non-IPv6 addresses */ + if (ss.sin6_family != AF_INET6) { + continue; + } + + /* Ignore addresses other than link-local */ + if (ss.sin6_scope_id != 0x20) { + continue; + } + + /* + * Error if there are multiple interfaces with + * a link-local address (they will be ambiguous). 
+ */ + if (link_local_ifkindex >= 0) { + opal_output (0, + "btl_tcp_component: " + "multiple link-local addresses found"); + ret = OMPI_ERROR; + goto cleanup; + } + + link_local_ifkindex = opal_ifindextokindex(if_index); + opal_ifindextoname(if_index, if_name, sizeof(if_name)); + opal_output (0, + "btl_tcp_component: " + "using %s for link-local traffic", + if_name); + } + } + + if (link_local_ifkindex < 0) { + opal_output (0, + "btl_tcp_component: " + "no link-local addresses found"); + ret = OMPI_ERROR; + goto cleanup; + } + + mca_btl_tcp_component.tcp6_link_local_scope_id = link_local_ifkindex; + } +#endif + +cleanup: if (NULL != kindexes) { free(kindexes); } diff --git a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c index 89aee88..21e4e50 100644 --- a/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c +++ b/1.10.2/src/openmpi-1.10.2/ompi/mca/btl/tcp/btl_tcp_proc.c @@ -798,6 +798,18 @@ bool mca_btl_tcp_proc_tosocks(mca_btl_tcp_addr_t* proc_addr, inaddr->sin6_port = proc_addr->addr_port; inaddr->sin6_scope_id = 0; inaddr->sin6_flowinfo = 0; + + /* + * If this is a link-local address AND the component is configured + * to allow link-local addresses for BTL traffic, set the + * scope_id so the kernel passes it to the right network interface. 
+ */ + if ((inaddr->sin6_addr.s6_addr[0] & 0xff) == 0xfe && + (inaddr->sin6_addr.s6_addr[1] & 0xc0) == 0x80 && + mca_btl_tcp_component.tcp6_use_link_local) { + inaddr->sin6_scope_id = + mca_btl_tcp_component.tcp6_link_local_scope_id; + } } break; #endif diff --git a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c index 2832371..ffa0b8e 100644 --- a/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c +++ b/1.10.2/src/openmpi-1.10.2/opal/mca/if/linux_ipv6/if_linux_ipv6.c @@ -118,8 +118,8 @@ static int if_linux_ipv6_open(void) addrbyte[8], addrbyte[9], addrbyte[10], addrbyte[11], addrbyte[12], addrbyte[13], addrbyte[14], addrbyte[15], scope); - /* Only interested in global (0x00) scope */ - if (scope != 0x00) { + /* Only interested in global (0x00) and link-local (0x20) scope */ + if (scope != 0x00 && scope != 0x20) { opal_output_verbose(1, opal_if_base_framework.framework_output, "skipping interface %2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x:%2x%2x scope %x\n", addrbyte[0], addrbyte[1], addrbyte[2], addrbyte[3], diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c index 498a42d..5b453ca 100644 --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.c @@ -438,6 +438,17 @@ static int tcp_component_register(void) OPAL_INFO_LVL_6, MCA_BASE_VAR_SCOPE_READONLY, &mca_oob_tcp_component.skip_version_check); + +#if OPAL_ENABLE_IPV6 + mca_oob_tcp_component.tcp6_use_link_local = false; + mca_oob_tcp_component.tcp6_link_local_scope_id = -1; + (void)mca_base_component_var_register(component, "enable_ipv6_link_local", + "Whether to enable use of IPv6 link-local addresses (default: false)", + MCA_BASE_VAR_TYPE_BOOL, NULL, 0, 0, + OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, + 
&mca_oob_tcp_component.tcp6_use_link_local); +#endif return ORTE_SUCCESS; } @@ -566,6 +577,12 @@ static bool component_available(void) opal_argv_append_nosize(&mca_oob_tcp_component.ipv4conns, opal_net_get_hostname((struct sockaddr*) &my_ss)); } else if (AF_INET6 == my_ss.ss_family) { #if OPAL_ENABLE_IPV6 + /* If this address has link local scope, capture its interface index */ + if (mca_oob_tcp_component.tcp6_use_link_local && + ((struct sockaddr_in6*) &my_ss)->sin6_scope_id == 0x20) { + mca_oob_tcp_component.tcp6_link_local_scope_id = kindex; + } + opal_output_verbose(10, orte_oob_base_framework.framework_output, "%s oob:tcp:init adding %s to our list of %s connections", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h index d8d47a2..03182b4 100644 --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_component.h @@ -62,9 +62,11 @@ typedef struct { #if OPAL_ENABLE_IPV6 /* IPv6 support */ - bool disable_ipv6_family; /**< disable this AF */ - char** tcp6_static_ports; /**< Static ports - IPV6 */ - char** tcp6_dyn_ports; /**< Dynamic ports - IPV6 */ + bool disable_ipv6_family; /**< disable this AF */ + char** tcp6_static_ports; /**< Static ports - IPV6 */ + char** tcp6_dyn_ports; /**< Dynamic ports - IPV6 */ + bool tcp6_use_link_local; /**< Enable use of IPv6 link-local addresses */ + int tcp6_link_local_scope_id; /**< Kernel index of interface for link-local traffic */ char** ipv6conns; char** ipv6ports; #endif diff --git a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c index 124dc9d..71565ee 100644 --- a/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c +++ b/1.10.2/src/openmpi-1.10.2/orte/mca/oob/tcp/oob_tcp_connection.c @@ -144,6 +144,19 @@ void 
mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) continue; } +#if OPAL_ENABLE_IPV6 + if (addr->addr.ss_family == AF_INET6) { + struct sockaddr_in6* inaddr = (struct sockaddr_in6*) &addr->addr; + + if ((inaddr->sin6_addr.s6_addr[0] & 0xff) == 0xfe && + (inaddr->sin6_addr.s6_addr[1] & 0xc0) == 0x80 && + mca_oob_tcp_component.tcp6_use_link_local) { + inaddr->sin6_scope_id = + mca_oob_tcp_component.tcp6_link_local_scope_id; + } + } +#endif + addrlen = addr->addr.ss_family == AF_INET6 ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in); @@ -172,6 +185,25 @@ void mca_oob_tcp_peer_try_connect(int fd, short args, void *cbdata) CLOSE_THE_SOCKET(peer->sd); continue; } + /* When testing use of IPv6 link-local addresses, the 4.0 + * kernel would immediately return EADDRNOTAVAIL when + * connecting to a link-local address on the same host. + * This appears to be a transient problem that only + * manifests for a short period of time after calling + * listen(2) on the server side of the socket. Therefore, + * inserting a small delay on the client side fixes the + * problem. Since establishing these connections only + * happens at initialization time, a delay is acceptable. + */ + if (EADDRNOTAVAIL == opal_socket_errno) { + opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, + "%s connection to %s returned EADDRNOTAVAIL - retrying after delay", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + ORTE_NAME_PRINT(&peer->name)); + CLOSE_THE_SOCKET(peer->sd); + sleep(1); + continue; + } if (rc < 0) { opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s connection to %s returned %d (%d, %s)",