/* * Copyright (C) 2007 Michael Brown <mbrown@fensystems.co.uk>. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ FILE_LICENCE ( GPL2_OR_LATER ); #include <stdint.h> #include <stdio.h> #include <unistd.h> #include <string.h> #include <byteswap.h> #include <errno.h> #include <gpxe/errortab.h> #include <gpxe/if_arp.h> #include <gpxe/iobuf.h> #include <gpxe/netdevice.h> #include <gpxe/infiniband.h> #include <gpxe/ib_pathrec.h> #include <gpxe/ib_mcast.h> #include <gpxe/ipoib.h> /** @file * * IP over Infiniband */ /** Number of IPoIB send work queue entries */ #define IPOIB_NUM_SEND_WQES 2 /** Number of IPoIB receive work queue entries */ #define IPOIB_NUM_RECV_WQES 4 /** Number of IPoIB completion entries */ #define IPOIB_NUM_CQES 8 /** An IPoIB device */ struct ipoib_device { /** Network device */ struct net_device *netdev; /** Underlying Infiniband device */ struct ib_device *ibdev; /** Completion queue */ struct ib_completion_queue *cq; /** Queue pair */ struct ib_queue_pair *qp; /** Broadcast MAC */ struct ipoib_mac broadcast; /** Joined to IPv4 broadcast multicast group * * This flag indicates whether or not we have initiated the * join to the IPv4 broadcast multicast group. */ int broadcast_joined; /** IPv4 broadcast multicast group membership */ struct ib_mc_membership broadcast_membership; }; /** Broadcast IPoIB address */ static struct ipoib_mac ipoib_broadcast = { .flags__qpn = htonl ( IB_QPN_BROADCAST ), .gid.u.bytes = { 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }, }; /** Link status for "broadcast join in progress" */ #define EINPROGRESS_JOINING ( EINPROGRESS | EUNIQ_01 ) /** Human-readable message for the link status */ struct errortab ipoib_errors[] __errortab = { { EINPROGRESS_JOINING, "Joining" }, }; /**************************************************************************** * * IPoIB peer cache * **************************************************************************** */ /** * IPoIB peer address * * The IPoIB link-layer header is only four bytes long and so does not * have sufficient room to store IPoIB MAC address(es). We therefore * maintain a cache of MAC addresses identified by a single-byte key, * and abuse the spare two bytes within the link-layer header to * communicate these MAC addresses between the link-layer code and the * netdevice driver. */ struct ipoib_peer { /** Key */ uint8_t key; /** MAC address */ struct ipoib_mac mac; }; /** Number of IPoIB peer cache entries * * Must be a power of two. */ #define IPOIB_NUM_CACHED_PEERS 4 /** IPoIB peer address cache */ static struct ipoib_peer ipoib_peer_cache[IPOIB_NUM_CACHED_PEERS]; /** Oldest IPoIB peer cache entry index */ static unsigned int ipoib_peer_cache_idx = 1; /** * Look up cached peer by key * * @v key Peer cache key * @ret peer Peer cache entry, or NULL */ static struct ipoib_peer * ipoib_lookup_peer_by_key ( unsigned int key ) { struct ipoib_peer *peer; unsigned int i; for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { peer = &ipoib_peer_cache[i]; if ( peer->key == key ) return peer; } if ( key != 0 ) { DBG ( "IPoIB warning: peer cache lost track of key %x while " "still in use\n", key ); } return NULL; } /** * Store GID and QPN in peer cache * * @v mac Peer MAC address * @ret peer Peer cache entry */ static struct ipoib_peer * ipoib_cache_peer ( const struct ipoib_mac *mac ) { struct ipoib_peer *peer; unsigned int key; unsigned int i; /* Look for existing cache entry */ for ( i = 0 ; i < IPOIB_NUM_CACHED_PEERS ; i++ ) { peer = &ipoib_peer_cache[i]; if ( memcmp ( &peer->mac, mac, sizeof ( peer->mac ) ) == 0 ) return peer; } /* No entry found: create a new one */ key = ipoib_peer_cache_idx++; peer = &ipoib_peer_cache[ key % IPOIB_NUM_CACHED_PEERS ]; if ( peer->key ) DBG ( "IPoIB peer %x evicted from cache\n", peer->key ); memset ( peer, 0, sizeof ( *peer ) ); peer->key = key; memcpy ( &peer->mac, mac, sizeof ( peer->mac ) ); DBG ( "IPoIB peer %x has MAC %s\n", peer->key, ipoib_ntoa ( &peer->mac ) ); return peer; } /**************************************************************************** * * IPoIB link layer * **************************************************************************** */ /** * Add IPoIB link-layer header * * @v netdev Network device * @v iobuf I/O buffer * @v ll_dest Link-layer destination address * @v ll_source Source link-layer address * @v net_proto Network-layer protocol, in network-byte order * @ret rc Return status code */ static int ipoib_push ( struct net_device *netdev __unused, struct io_buffer *iobuf, const void *ll_dest, const void *ll_source __unused, uint16_t net_proto ) { struct ipoib_hdr *ipoib_hdr = iob_push ( iobuf, sizeof ( *ipoib_hdr ) ); const struct ipoib_mac *dest_mac = ll_dest; const struct ipoib_mac *src_mac = ll_source; struct ipoib_peer *dest; struct ipoib_peer *src; /* Add link-layer addresses to cache */ dest = ipoib_cache_peer ( dest_mac ); src = ipoib_cache_peer ( src_mac ); /* Build IPoIB header */ ipoib_hdr->proto = net_proto; ipoib_hdr->u.peer.dest = dest->key; ipoib_hdr->u.peer.src = src->key; return 0; } /** * Remove IPoIB link-layer header * * @v netdev Network device * @v iobuf I/O buffer * @ret ll_dest Link-layer destination address * @ret ll_source Source link-layer address * @ret net_proto Network-layer protocol, in network-byte order * @ret rc Return status code */ static int ipoib_pull ( struct net_device *netdev, struct io_buffer *iobuf, const void **ll_dest, const void **ll_source, uint16_t *net_proto ) { struct ipoib_device *ipoib = netdev->priv; struct ipoib_hdr *ipoib_hdr = iobuf->data; struct ipoib_peer *dest; struct ipoib_peer *source; /* Sanity check */ if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) { DBG ( "IPoIB packet too short for link-layer header\n" ); DBG_HD ( iobuf->data, iob_len ( iobuf ) ); return -EINVAL; } /* Strip off IPoIB header */ iob_pull ( iobuf, sizeof ( *ipoib_hdr ) ); /* Identify source and destination addresses, and clear * reserved word in IPoIB header */ dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest ); source = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.src ); ipoib_hdr->u.reserved = 0; /* Fill in required fields */ *ll_dest = ( dest ? &dest->mac : &ipoib->broadcast ); *ll_source = ( source ? &source->mac : &ipoib->broadcast ); *net_proto = ipoib_hdr->proto; return 0; } /** * Initialise IPoIB link-layer address * * @v hw_addr Hardware address * @v ll_addr Link-layer address */ static void ipoib_init_addr ( const void *hw_addr, void *ll_addr ) { const struct ib_gid_half *guid = hw_addr; struct ipoib_mac *mac = ll_addr; memset ( mac, 0, sizeof ( *mac ) ); memcpy ( &mac->gid.u.half[1], guid, sizeof ( mac->gid.u.half[1] ) ); } /** * Transcribe IPoIB link-layer address * * @v ll_addr Link-layer address * @ret string Link-layer address in human-readable format */ const char * ipoib_ntoa ( const void *ll_addr ) { static char buf[45]; const struct ipoib_mac *mac = ll_addr; snprintf ( buf, sizeof ( buf ), "%08x:%08x:%08x:%08x:%08x", htonl ( mac->flags__qpn ), htonl ( mac->gid.u.dwords[0] ), htonl ( mac->gid.u.dwords[1] ), htonl ( mac->gid.u.dwords[2] ), htonl ( mac->gid.u.dwords[3] ) ); return buf; } /** * Hash multicast address * * @v af Address family * @v net_addr Network-layer address * @v ll_addr Link-layer address to fill in * @ret rc Return status code */ static int ipoib_mc_hash ( unsigned int af __unused, const void *net_addr __unused, void *ll_addr __unused ) { return -ENOTSUP; } /** * Generate Mellanox Ethernet-compatible compressed link-layer address * * @v ll_addr Link-layer address * @v eth_addr Ethernet-compatible address to fill in */ static int ipoib_mlx_eth_addr ( const struct ib_gid_half *guid, uint8_t *eth_addr ) { eth_addr[0] = ( ( guid->u.bytes[3] == 2 ) ? 0x00 : 0x02 ); eth_addr[1] = guid->u.bytes[1]; eth_addr[2] = guid->u.bytes[2]; eth_addr[3] = guid->u.bytes[5]; eth_addr[4] = guid->u.bytes[6]; eth_addr[5] = guid->u.bytes[7]; return 0; } /** An IPoIB Ethernet-compatible compressed link-layer address generator */ struct ipoib_eth_addr_handler { /** GUID byte 1 */ uint8_t byte1; /** GUID byte 2 */ uint8_t byte2; /** Handler */ int ( * eth_addr ) ( const struct ib_gid_half *guid, uint8_t *eth_addr ); }; /** IPoIB Ethernet-compatible compressed link-layer address generators */ static struct ipoib_eth_addr_handler ipoib_eth_addr_handlers[] = { { 0x02, 0xc9, ipoib_mlx_eth_addr }, }; /** * Generate Ethernet-compatible compressed link-layer address * * @v ll_addr Link-layer address * @v eth_addr Ethernet-compatible address to fill in */ static int ipoib_eth_addr ( const void *ll_addr, void *eth_addr ) { const struct ipoib_mac *ipoib_addr = ll_addr; const struct ib_gid_half *guid = &ipoib_addr->gid.u.half[1]; struct ipoib_eth_addr_handler *handler; unsigned int i; for ( i = 0 ; i < ( sizeof ( ipoib_eth_addr_handlers ) / sizeof ( ipoib_eth_addr_handlers[0] ) ) ; i++ ) { handler = &ipoib_eth_addr_handlers[i]; if ( ( handler->byte1 == guid->u.bytes[1] ) && ( handler->byte2 == guid->u.bytes[2] ) ) { return handler->eth_addr ( guid, eth_addr ); } } return -ENOTSUP; } /** IPoIB protocol */ struct ll_protocol ipoib_protocol __ll_protocol = { .name = "IPoIB", .ll_proto = htons ( ARPHRD_INFINIBAND ), .hw_addr_len = sizeof ( struct ib_gid_half ), .ll_addr_len = IPOIB_ALEN, .ll_header_len = IPOIB_HLEN, .push = ipoib_push, .pull = ipoib_pull, .init_addr = ipoib_init_addr, .ntoa = ipoib_ntoa, .mc_hash = ipoib_mc_hash, .eth_addr = ipoib_eth_addr, }; /** * Allocate IPoIB device * * @v priv_size Size of driver private data * @ret netdev Network device, or NULL */ struct net_device * alloc_ipoibdev ( size_t priv_size ) { struct net_device *netdev; netdev = alloc_netdev ( priv_size ); if ( netdev ) { netdev->ll_protocol = &ipoib_protocol; netdev->ll_broadcast = ( uint8_t * ) &ipoib_broadcast; netdev->max_pkt_len = IB_MAX_PAYLOAD_SIZE; } return netdev; } /**************************************************************************** * * IPoIB network device * **************************************************************************** */ /** * Transmit packet via IPoIB network device * * @v netdev Network device * @v iobuf I/O buffer * @ret rc Return status code */ static int ipoib_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; struct ipoib_hdr *ipoib_hdr; struct ipoib_peer *dest; struct ib_address_vector av; int rc; /* Sanity check */ if ( iob_len ( iobuf ) < sizeof ( *ipoib_hdr ) ) { DBGC ( ipoib, "IPoIB %p buffer too short\n", ipoib ); return -EINVAL; } ipoib_hdr = iobuf->data; /* Attempting transmission while link is down will put the * queue pair into an error state, so don't try it. */ if ( ! ib_link_ok ( ibdev ) ) return -ENETUNREACH; /* Identify destination address */ dest = ipoib_lookup_peer_by_key ( ipoib_hdr->u.peer.dest ); if ( ! dest ) return -ENXIO; ipoib_hdr->u.reserved = 0; /* Construct address vector */ memset ( &av, 0, sizeof ( av ) ); av.qpn = ( ntohl ( dest->mac.flags__qpn ) & IB_QPN_MASK ); av.gid_present = 1; memcpy ( &av.gid, &dest->mac.gid, sizeof ( av.gid ) ); if ( ( rc = ib_resolve_path ( ibdev, &av ) ) != 0 ) { /* Path not resolved yet */ return rc; } return ib_post_send ( ibdev, ipoib->qp, &av, iobuf ); } /** * Handle IPoIB send completion * * @v ibdev Infiniband device * @v qp Queue pair * @v iobuf I/O buffer * @v rc Completion status code */ static void ipoib_complete_send ( struct ib_device *ibdev __unused, struct ib_queue_pair *qp, struct io_buffer *iobuf, int rc ) { struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); netdev_tx_complete_err ( ipoib->netdev, iobuf, rc ); } /** * Handle IPoIB receive completion * * @v ibdev Infiniband device * @v qp Queue pair * @v av Address vector, or NULL * @v iobuf I/O buffer * @v rc Completion status code */ static void ipoib_complete_recv ( struct ib_device *ibdev __unused, struct ib_queue_pair *qp, struct ib_address_vector *av, struct io_buffer *iobuf, int rc ) { struct ipoib_device *ipoib = ib_qp_get_ownerdata ( qp ); struct net_device *netdev = ipoib->netdev; struct ipoib_hdr *ipoib_hdr; struct ipoib_mac ll_src; struct ipoib_peer *src; if ( rc != 0 ) { netdev_rx_err ( netdev, iobuf, rc ); return; } /* Sanity check */ if ( iob_len ( iobuf ) < sizeof ( struct ipoib_hdr ) ) { DBGC ( ipoib, "IPoIB %p received packet too short to " "contain IPoIB header\n", ipoib ); DBGC_HD ( ipoib, iobuf->data, iob_len ( iobuf ) ); netdev_rx_err ( netdev, iobuf, -EIO ); return; } ipoib_hdr = iobuf->data; /* Parse source address */ if ( av->gid_present ) { ll_src.flags__qpn = htonl ( av->qpn ); memcpy ( &ll_src.gid, &av->gid, sizeof ( ll_src.gid ) ); src = ipoib_cache_peer ( &ll_src ); ipoib_hdr->u.peer.src = src->key; } /* Hand off to network layer */ netdev_rx ( netdev, iobuf ); } /** IPoIB completion operations */ static struct ib_completion_queue_operations ipoib_cq_op = { .complete_send = ipoib_complete_send, .complete_recv = ipoib_complete_recv, }; /** * Poll IPoIB network device * * @v netdev Network device */ static void ipoib_poll ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; ib_poll_eq ( ibdev ); } /** * Enable/disable interrupts on IPoIB network device * * @v netdev Network device * @v enable Interrupts should be enabled */ static void ipoib_irq ( struct net_device *netdev __unused, int enable __unused ) { /* No implementation */ } /** * Handle IPv4 broadcast multicast group join completion * * @v ibdev Infiniband device * @v qp Queue pair * @v membership Multicast group membership * @v rc Status code * @v mad Response MAD (or NULL on error) */ void ipoib_join_complete ( struct ib_device *ibdev __unused, struct ib_queue_pair *qp __unused, struct ib_mc_membership *membership, int rc, union ib_mad *mad __unused ) { struct ipoib_device *ipoib = container_of ( membership, struct ipoib_device, broadcast_membership ); /* Record join status as link status */ netdev_link_err ( ipoib->netdev, rc ); } /** * Join IPv4 broadcast multicast group * * @v ipoib IPoIB device * @ret rc Return status code */ static int ipoib_join_broadcast_group ( struct ipoib_device *ipoib ) { int rc; if ( ( rc = ib_mcast_join ( ipoib->ibdev, ipoib->qp, &ipoib->broadcast_membership, &ipoib->broadcast.gid, ipoib_join_complete ) ) != 0 ) { DBGC ( ipoib, "IPoIB %p could not join broadcast group: %s\n", ipoib, strerror ( rc ) ); return rc; } ipoib->broadcast_joined = 1; return 0; } /** * Leave IPv4 broadcast multicast group * * @v ipoib IPoIB device */ static void ipoib_leave_broadcast_group ( struct ipoib_device *ipoib ) { if ( ipoib->broadcast_joined ) { ib_mcast_leave ( ipoib->ibdev, ipoib->qp, &ipoib->broadcast_membership ); ipoib->broadcast_joined = 0; } } /** * Open IPoIB network device * * @v netdev Network device * @ret rc Return status code */ static int ipoib_open ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); int rc; /* Open IB device */ if ( ( rc = ib_open ( ibdev ) ) != 0 ) { DBGC ( ipoib, "IPoIB %p could not open device: %s\n", ipoib, strerror ( rc ) ); goto err_ib_open; } /* Allocate completion queue */ ipoib->cq = ib_create_cq ( ibdev, IPOIB_NUM_CQES, &ipoib_cq_op ); if ( ! ipoib->cq ) { DBGC ( ipoib, "IPoIB %p could not allocate completion queue\n", ipoib ); rc = -ENOMEM; goto err_create_cq; } /* Allocate queue pair */ ipoib->qp = ib_create_qp ( ibdev, IB_QPT_UD, IPOIB_NUM_SEND_WQES, ipoib->cq, IPOIB_NUM_RECV_WQES, ipoib->cq ); if ( ! ipoib->qp ) { DBGC ( ipoib, "IPoIB %p could not allocate queue pair\n", ipoib ); rc = -ENOMEM; goto err_create_qp; } ib_qp_set_ownerdata ( ipoib->qp, ipoib ); /* Update MAC address with QPN */ mac->flags__qpn = htonl ( ipoib->qp->qpn ); /* Fill receive rings */ ib_refill_recv ( ibdev, ipoib->qp ); /* Fake a link status change to join the broadcast group */ ipoib_link_state_changed ( ibdev ); return 0; ib_destroy_qp ( ibdev, ipoib->qp ); err_create_qp: ib_destroy_cq ( ibdev, ipoib->cq ); err_create_cq: ib_close ( ibdev ); err_ib_open: return rc; } /** * Close IPoIB network device * * @v netdev Network device */ static void ipoib_close ( struct net_device *netdev ) { struct ipoib_device *ipoib = netdev->priv; struct ib_device *ibdev = ipoib->ibdev; struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); /* Leave broadcast group */ ipoib_leave_broadcast_group ( ipoib ); /* Remove QPN from MAC address */ mac->flags__qpn = 0; /* Tear down the queues */ ib_destroy_qp ( ibdev, ipoib->qp ); ib_destroy_cq ( ibdev, ipoib->cq ); /* Close IB device */ ib_close ( ibdev ); } /** IPoIB network device operations */ static struct net_device_operations ipoib_operations = { .open = ipoib_open, .close = ipoib_close, .transmit = ipoib_transmit, .poll = ipoib_poll, .irq = ipoib_irq, }; /** * Handle link status change * * @v ibdev Infiniband device */ void ipoib_link_state_changed ( struct ib_device *ibdev ) { struct net_device *netdev = ib_get_ownerdata ( ibdev ); struct ipoib_device *ipoib = netdev->priv; struct ipoib_mac *mac = ( ( struct ipoib_mac * ) netdev->ll_addr ); int rc; /* Leave existing broadcast group */ ipoib_leave_broadcast_group ( ipoib ); /* Update MAC address based on potentially-new GID prefix */ memcpy ( &mac->gid.u.half[0], &ibdev->gid.u.half[0], sizeof ( mac->gid.u.half[0] ) ); /* Update broadcast GID based on potentially-new partition key */ ipoib->broadcast.gid.u.words[2] = htons ( ibdev->pkey | IB_PKEY_FULL ); /* Set net device link state to reflect Infiniband link state */ rc = ib_link_rc ( ibdev ); netdev_link_err ( netdev, ( rc ? rc : -EINPROGRESS_JOINING ) ); /* Join new broadcast group */ if ( ib_link_ok ( ibdev ) && ( ( rc = ipoib_join_broadcast_group ( ipoib ) ) != 0 ) ) { DBGC ( ipoib, "IPoIB %p could not rejoin broadcast group: " "%s\n", ipoib, strerror ( rc ) ); netdev_link_err ( netdev, rc ); return; } } /** * Probe IPoIB device * * @v ibdev Infiniband device * @ret rc Return status code */ int ipoib_probe ( struct ib_device *ibdev ) { struct net_device *netdev; struct ipoib_device *ipoib; int rc; /* Allocate network device */ netdev = alloc_ipoibdev ( sizeof ( *ipoib ) ); if ( ! netdev ) return -ENOMEM; netdev_init ( netdev, &ipoib_operations ); ipoib = netdev->priv; ib_set_ownerdata ( ibdev, netdev ); netdev->dev = ibdev->dev; memset ( ipoib, 0, sizeof ( *ipoib ) ); ipoib->netdev = netdev; ipoib->ibdev = ibdev; /* Extract hardware address */ memcpy ( netdev->hw_addr, &ibdev->gid.u.half[1], sizeof ( ibdev->gid.u.half[1] ) ); /* Set default broadcast address */ memcpy ( &ipoib->broadcast, &ipoib_broadcast, sizeof ( ipoib->broadcast ) ); netdev->ll_broadcast = ( ( uint8_t * ) &ipoib->broadcast ); /* Register network device */ if ( ( rc = register_netdev ( netdev ) ) != 0 ) goto err_register_netdev; return 0; err_register_netdev: netdev_nullify ( netdev ); netdev_put ( netdev ); return rc; } /** * Remove IPoIB device * * @v ibdev Infiniband device */ void ipoib_remove ( struct ib_device *ibdev ) { struct net_device *netdev = ib_get_ownerdata ( ibdev ); unregister_netdev ( netdev ); netdev_nullify ( netdev ); netdev_put ( netdev ); }