[xen] Support scatter-gather to allow for jumbo frames

The use of jumbo frames for the Xen netfront virtual NIC requires the
use of scatter-gather ("feature-sg"), with the receive descriptor ring
becoming a list of page-sized buffers and the backend using as many
page buffers as required for each packet.

Since iPXE's abstraction of an I/O buffer does not include any sort of
scatter-gather list, this requires an extra allocation and copy on the
receive datapath for any packet that spans more than a single page.

This support is required in order to successfully boot an AWS EC2
virtual machine (with non-enhanced networking) via iSCSI if jumbo
frames are enabled, since the netback driver used in EC2 seems not to
allow "feature-sg" to be renegotiated once the Linux kernel driver
takes over.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
pull/329/head
Michael Brown 2021-04-14 16:33:41 +01:00
parent 8ca43ccbc1
commit 85d179f2c6
3 changed files with 154 additions and 57 deletions

View File

@@ -56,7 +56,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
__einfo_uniqify ( EINFO_EIO, -NETIF_RSP_DROPPED, \
"Packet dropped" )
#define EIO_NETIF_RSP( status ) \
EUNIQ ( EINFO_EIO, -(status), \
EUNIQ ( EINFO_EIO, ( -(status) & 0x1f ), \
EIO_NETIF_RSP_ERROR, EIO_NETIF_RSP_DROPPED )
/******************************************************************************
@@ -326,6 +326,7 @@ static int netfront_create_ring ( struct netfront_nic *netfront,
struct netfront_ring *ring ) {
struct xen_device *xendev = netfront->xendev;
struct xen_hypervisor *xen = xendev->xen;
physaddr_t addr;
unsigned int i;
int rc;
@@ -345,11 +346,11 @@ static int netfront_create_ring ( struct netfront_nic *netfront,
}
/* Grant access to shared ring */
addr = virt_to_phys ( ring->sring.raw );
if ( ( rc = xengrant_permit_access ( xen, ring->ref, xendev->backend_id,
0, ring->sring.raw ) ) != 0 ) {
0, addr ) ) != 0 ) {
DBGC ( netfront, "NETFRONT %s could not permit access to "
"%#08lx: %s\n", xendev->key,
virt_to_phys ( ring->sring.raw ), strerror ( rc ) );
"%#08lx: %s\n", xendev->key, addr, strerror ( rc ) );
goto err_permit_access;
}
@@ -358,10 +359,8 @@ static int netfront_create_ring ( struct netfront_nic *netfront,
ring->ref ) ) != 0 )
goto err_write_num;
DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n",
xendev->key, ring->ref_key, ring->ref,
virt_to_phys ( ring->sring.raw ),
( virt_to_phys ( ring->sring.raw ) + PAGE_SIZE ) );
DBGC ( netfront, "NETFRONT %s %s=\"%d\" [%08lx,%08lx)\n", xendev->key,
ring->ref_key, ring->ref, addr, ( addr + PAGE_SIZE ) );
return 0;
netfront_rm ( netfront, ring->ref_key );
@@ -378,7 +377,8 @@ static int netfront_create_ring ( struct netfront_nic *netfront,
*
* @v netfront Netfront device
* @v ring Descriptor ring
* @v iobuf I/O buffer
* @v addr Physical address
* @v iobuf Associated I/O buffer, or NULL
* @v id Buffer ID to fill in
* @v ref Grant reference to fill in
* @ret rc Return status code
@@ -387,8 +387,9 @@ static int netfront_create_ring ( struct netfront_nic *netfront,
* ring.
*/
static int netfront_push ( struct netfront_nic *netfront,
struct netfront_ring *ring, struct io_buffer *iobuf,
uint16_t *id, grant_ref_t *ref ) {
struct netfront_ring *ring, physaddr_t addr,
struct io_buffer *iobuf, uint16_t *id,
grant_ref_t *ref ) {
struct xen_device *xendev = netfront->xendev;
struct xen_hypervisor *xen = xendev->xen;
unsigned int next_id;
@@ -402,19 +403,15 @@ static int netfront_push ( struct netfront_nic *netfront,
next_id = ring->ids[ ring->id_prod & ( ring->count - 1 ) ];
next_ref = ring->refs[next_id];
/* Grant access to I/O buffer page. I/O buffers are naturally
* aligned, so we never need to worry about crossing a page
* boundary.
*/
/* Grant access to page containing address */
if ( ( rc = xengrant_permit_access ( xen, next_ref, xendev->backend_id,
0, iobuf->data ) ) != 0 ) {
0, addr ) ) != 0 ) {
DBGC ( netfront, "NETFRONT %s could not permit access to "
"%#08lx: %s\n", xendev->key,
virt_to_phys ( iobuf->data ), strerror ( rc ) );
"%#08lx: %s\n", xendev->key, addr, strerror ( rc ) );
return rc;
}
/* Store I/O buffer */
/* Store associated I/O buffer, if any */
assert ( ring->iobufs[next_id] == NULL );
ring->iobufs[next_id] = iobuf;
@@ -434,7 +431,7 @@ static int netfront_push ( struct netfront_nic *netfront,
* @v netfront Netfront device
* @v ring Descriptor ring
* @v id Buffer ID
* @ret iobuf I/O buffer
* @ret iobuf Associated I/O buffer, if any
*/
static struct io_buffer * netfront_pull ( struct netfront_nic *netfront,
struct netfront_ring *ring,
@@ -451,7 +448,6 @@ static struct io_buffer * netfront_pull ( struct netfront_nic *netfront,
/* Retrieve I/O buffer */
iobuf = ring->iobufs[id];
assert ( iobuf != NULL );
ring->iobufs[id] = NULL;
/* Free buffer ID */
@@ -494,6 +490,22 @@ static void netfront_destroy_ring ( struct netfront_nic *netfront,
ring->sring.raw = NULL;
}
/**
 * Discard partially received I/O buffers
 *
 * Frees every buffer currently held on the rx_partial reassembly
 * list, leaving the list empty.  Called when a receive error aborts a
 * multi-buffer packet, when reassembly fails, or when the device is
 * closed with a packet still in flight.
 *
 * @v netfront		Netfront device
 */
static void netfront_discard ( struct netfront_nic *netfront ) {
	struct io_buffer *iobuf;
	struct io_buffer *tmp;

	/* Discard all buffers in the list (using the _safe iterator,
	 * since each buffer is unlinked and freed as we go)
	 */
	list_for_each_entry_safe ( iobuf, tmp, &netfront->rx_partial, list ) {
		list_del ( &iobuf->list );
		free_iob ( iobuf );
	}
}
/******************************************************************************
*
* Network device interface
@@ -512,6 +524,7 @@ static void netfront_refill_rx ( struct net_device *netdev ) {
struct io_buffer *iobuf;
struct netif_rx_request *request;
unsigned int refilled = 0;
physaddr_t addr;
int notify;
int rc;
@@ -524,24 +537,24 @@ static void netfront_refill_rx ( struct net_device *netdev ) {
/* Wait for next refill */
break;
}
addr = virt_to_phys ( iobuf->data );
/* Add to descriptor ring */
request = RING_GET_REQUEST ( &netfront->rx_fring,
netfront->rx_fring.req_prod_pvt );
if ( ( rc = netfront_push ( netfront, &netfront->rx,
if ( ( rc = netfront_push ( netfront, &netfront->rx, addr,
iobuf, &request->id,
&request->gref ) ) != 0 ) {
netdev_rx_err ( netdev, iobuf, rc );
break;
}
DBGC2 ( netfront, "NETFRONT %s RX id %d ref %d is %#08lx+%zx\n",
xendev->key, request->id, request->gref,
virt_to_phys ( iobuf->data ), iob_tailroom ( iobuf ) );
xendev->key, request->id, request->gref, addr,
iob_tailroom ( iobuf ) );
/* Move to next descriptor */
netfront->rx_fring.req_prod_pvt++;
refilled++;
}
/* Push new descriptors and notify backend if applicable */
@@ -593,6 +606,10 @@ static int netfront_open ( struct net_device *netdev ) {
if ( ( rc = netfront_write_flag ( netfront, "request-rx-copy" ) ) != 0 )
goto err_request_rx_copy;
/* Inform backend that we can support scatter-gather */
if ( ( rc = netfront_write_flag ( netfront, "feature-sg" ) ) != 0 )
goto err_feature_sg;
/* Disable checksum offload, since we will always do the work anyway */
if ( ( rc = netfront_write_flag ( netfront,
"feature-no-csum-offload" ) ) != 0 )
@@ -632,6 +649,8 @@ static int netfront_open ( struct net_device *netdev ) {
err_feature_rx_notify:
netfront_rm ( netfront, "feature-no-csum-offload" );
err_feature_no_csum_offload:
netfront_rm ( netfront, "feature-sg" );
err_feature_sg:
netfront_rm ( netfront, "request-rx-copy" );
err_request_rx_copy:
netfront_destroy_event ( netfront );
@@ -675,11 +694,15 @@ static void netfront_close ( struct net_device *netdev ) {
/* Delete flags */
netfront_rm ( netfront, "feature-rx-notify" );
netfront_rm ( netfront, "feature-no-csum-offload" );
netfront_rm ( netfront, "feature-sg" );
netfront_rm ( netfront, "request-rx-copy" );
/* Destroy event channel */
netfront_destroy_event ( netfront );
/* Discard any partially received I/O buffers */
netfront_discard ( netfront );
/* Destroy receive descriptor ring, freeing any outstanding
* I/O buffers.
*/
@@ -703,34 +726,66 @@ static int netfront_transmit ( struct net_device *netdev,
struct netfront_nic *netfront = netdev->priv;
struct xen_device *xendev = netfront->xendev;
struct netif_tx_request *request;
physaddr_t addr;
size_t len;
size_t remaining;
size_t frag_len;
unsigned int offset;
unsigned int count;
unsigned int more;
int notify;
int rc;
/* Calculate number of page buffers required */
addr = virt_to_phys ( iobuf->data );
len = iob_len ( iobuf );
offset = ( addr & ( PAGE_SIZE - 1 ) );
count = ( ( offset + len + PAGE_SIZE - 1 ) / PAGE_SIZE );
/* Check that we have space in the ring */
if ( netfront_ring_is_full ( &netfront->tx ) ) {
if ( netfront_ring_space ( &netfront->tx ) < count ) {
DBGC ( netfront, "NETFRONT %s out of transmit descriptors\n",
xendev->key );
return -ENOBUFS;
}
/* Add to descriptor ring */
request = RING_GET_REQUEST ( &netfront->tx_fring,
netfront->tx_fring.req_prod_pvt );
if ( ( rc = netfront_push ( netfront, &netfront->tx, iobuf,
&request->id, &request->gref ) ) != 0 ) {
return rc;
remaining = len;
while ( remaining ) {
/* Calculate length of this fragment */
frag_len = ( PAGE_SIZE - offset );
if ( frag_len >= remaining ) {
frag_len = remaining;
more = 0;
} else {
more = NETTXF_more_data;
}
/* Populate request */
request = RING_GET_REQUEST ( &netfront->tx_fring,
netfront->tx_fring.req_prod_pvt );
if ( ( rc = netfront_push ( netfront, &netfront->tx, addr,
( more ? NULL : iobuf ),
&request->id,
&request->gref ) ) != 0 ) {
return rc;
}
request->flags = ( NETTXF_data_validated | more );
request->offset = offset;
request->size = ( ( remaining == len ) ? len : frag_len );
DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is "
"%#08lx+%zx%s\n", xendev->key, request->id,
request->gref, addr, frag_len, ( more ? "..." : "" ) );
/* Move to next descriptor */
netfront->tx_fring.req_prod_pvt++;
addr += frag_len;
remaining -= frag_len;
offset = 0;
}
request->offset = ( virt_to_phys ( iobuf->data ) & ( PAGE_SIZE - 1 ) );
request->flags = NETTXF_data_validated;
request->size = iob_len ( iobuf );
DBGC2 ( netfront, "NETFRONT %s TX id %d ref %d is %#08lx+%zx\n",
xendev->key, request->id, request->gref,
virt_to_phys ( iobuf->data ), iob_len ( iobuf ) );
/* Consume descriptor */
netfront->tx_fring.req_prod_pvt++;
/* Push new descriptor and notify backend if applicable */
/* Push new descriptors and notify backend if applicable */
RING_PUSH_REQUESTS_AND_CHECK_NOTIFY ( &netfront->tx_fring, notify );
if ( notify )
netfront_send_event ( netfront );
@@ -748,7 +803,7 @@ static void netfront_poll_tx ( struct net_device *netdev ) {
struct xen_device *xendev = netfront->xendev;
struct netif_tx_response *response;
struct io_buffer *iobuf;
unsigned int status;
int status;
int rc;
/* Consume any unconsumed responses */
@@ -761,10 +816,11 @@ static void netfront_poll_tx ( struct net_device *netdev ) {
/* Retrieve from descriptor ring */
iobuf = netfront_pull ( netfront, &netfront->tx, response->id );
status = response->status;
if ( status == NETIF_RSP_OKAY ) {
if ( status >= NETIF_RSP_OKAY ) {
DBGC2 ( netfront, "NETFRONT %s TX id %d complete\n",
xendev->key, response->id );
netdev_tx_complete ( netdev, iobuf );
if ( iobuf )
netdev_tx_complete ( netdev, iobuf );
} else {
rc = -EIO_NETIF_RSP ( status );
DBGC2 ( netfront, "NETFRONT %s TX id %d error %d: %s\n",
@@ -786,6 +842,7 @@ static void netfront_poll_rx ( struct net_device *netdev ) {
struct netif_rx_response *response;
struct io_buffer *iobuf;
int status;
int more;
size_t len;
int rc;
@@ -799,21 +856,45 @@ static void netfront_poll_rx ( struct net_device *netdev ) {
/* Retrieve from descriptor ring */
iobuf = netfront_pull ( netfront, &netfront->rx, response->id );
status = response->status;
if ( status >= 0 ) {
len = status;
iob_reserve ( iobuf, response->offset );
iob_put ( iobuf, len );
DBGC2 ( netfront, "NETFRONT %s RX id %d complete "
"%#08lx+%zx\n", xendev->key, response->id,
virt_to_phys ( iobuf->data ), len );
netdev_rx ( netdev, iobuf );
} else {
more = ( response->flags & NETRXF_more_data );
/* Report errors */
if ( status < 0 ) {
rc = -EIO_NETIF_RSP ( status );
DBGC2 ( netfront, "NETFRONT %s RX id %d error %d: %s\n",
xendev->key, response->id, status,
strerror ( rc ) );
netfront_discard ( netfront );
netdev_rx_err ( netdev, iobuf, rc );
continue;
}
/* Add to partial receive list */
len = status;
iob_reserve ( iobuf, response->offset );
iob_put ( iobuf, len );
DBGC2 ( netfront, "NETFRONT %s RX id %d complete "
"%#08lx+%zx%s\n", xendev->key, response->id,
virt_to_phys ( iobuf->data ), len,
( more ? "..." : "" ) );
list_add_tail ( &iobuf->list, &netfront->rx_partial );
/* Wait until complete packet has been received */
if ( more )
continue;
/* Reassemble complete packet */
iobuf = iob_concatenate ( &netfront->rx_partial );
if ( ! iobuf ) {
DBGC2 ( netfront, "NETFRONT %s RX reassembly failed\n",
xendev->key );
netfront_discard ( netfront );
netdev_rx_err ( netdev, NULL, -ENOMEM );
continue;
}
/* Hand off to network stack */
netdev_rx ( netdev, iobuf );
}
}
@@ -871,6 +952,7 @@ static int netfront_probe ( struct xen_device *xendev ) {
netdev->dev = &xendev->dev;
netfront = netdev->priv;
netfront->xendev = xendev;
INIT_LIST_HEAD ( &netfront->rx_partial );
DBGC ( netfront, "NETFRONT %s backend=\"%s\" in domain %ld\n",
xendev->key, xendev->backend, xendev->backend_id );

View File

@@ -65,7 +65,7 @@ struct netfront_ring {
size_t count;
/** I/O buffers, indexed by buffer ID */
struct io_buffer **iobufs;
/** I/O buffer grant references, indexed by buffer ID */
/** Grant references, indexed by buffer ID */
grant_ref_t *refs;
/** Buffer ID ring */
@@ -116,6 +116,18 @@ netfront_ring_fill ( struct netfront_ring *ring ) {
return fill_level;
}
/**
 * Calculate descriptor ring remaining space
 *
 * @v ring		Descriptor ring
 * @ret space		Number of unused entries
 */
static inline __attribute__ (( always_inline )) unsigned int
netfront_ring_space ( struct netfront_ring *ring ) {
	/* Unused entries = total ring size minus current fill level */
	return ( ring->count - netfront_ring_fill ( ring ) );
}
/**
* Check whether or not descriptor ring is full
*
@@ -164,6 +176,8 @@ struct netfront_nic {
struct io_buffer *rx_iobufs[NETFRONT_NUM_RX_DESC];
/** Receive I/O buffer IDs */
uint8_t rx_ids[NETFRONT_NUM_RX_DESC];
/** Partial receive I/O buffer list */
struct list_head rx_partial;
/** Event channel */
struct evtchn_send event;

View File

@@ -166,16 +166,17 @@ xengrant_invalidate ( struct xen_hypervisor *xen, grant_ref_t ref ) {
* @v ref Grant reference
* @v domid Domain ID
* @v subflags Additional flags
* @v page Page start
* @v addr Physical address within page
* @ret rc Return status code
*/
static inline __attribute__ (( always_inline )) int
xengrant_permit_access ( struct xen_hypervisor *xen, grant_ref_t ref,
domid_t domid, unsigned int subflags, void *page ) {
domid_t domid, unsigned int subflags,
physaddr_t addr ) {
struct grant_entry_header *hdr = xengrant_header ( xen, ref );
struct grant_entry_v1 *v1 = xengrant_v1 ( hdr );
union grant_entry_v2 *v2 = xengrant_v2 ( hdr );
unsigned long frame = ( virt_to_phys ( page ) / PAGE_SIZE );
unsigned long frame = ( addr / PAGE_SIZE );
/* Fail (for test purposes) if applicable */
if ( ( XENGRANT_FAIL_RATE > 0 ) &&