/*
 * Copyright (c) 2007,2008 Shakir James, John DeHart and Washington University in St. Louis.
 * All rights reserved
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *    1. Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *    3. The name of the author or Washington University may not be used 
 *       to endorse or promote products derived from this source code 
 *       without specific prior written permission.
 *    4. Conditions of any other entities that contributed to this are also
 *       met. If a copyright notice is present from another entity, it must
 *       be maintained in redistributions of the source code.
 *
 * THIS INTELLECTUAL PROPERTY (WHICH MAY INCLUDE BUT IS NOT LIMITED TO SOFTWARE,
 * FIRMWARE, VHDL, etc) IS PROVIDED BY THE AUTHOR AND WASHINGTON UNIVERSITY 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR WASHINGTON UNIVERSITY 
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
 * ARISING IN ANY WAY OUT OF THE USE OF THIS INTELLECTUAL PROPERTY, EVEN IF 
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * */

/*
 * File:   inline_plugin_api.h (formerly plugin_api.c and plugin_api.h)
 * Author: Shakir James, John DeHart
 * Email:  scj1@arl.wustl.edu, jdd@arl.wustl.edu
 * Organization: Applied Research Laboratory
 * 
 * Derived from: NONE
 *
 * Date Created:  8/15/2007
 * 
 * Description:  Plugin API
 *
 * Modification History: 
 *
 * 03/07/08:  JDD : Added fcts to read/write IP, TCP, UDP headers and buffer descriptors.
 *                  Also added unaligned dram writes for 8B and 20B
 * 03/17/08:  JDD: Combined files and use __forceinline on all functions to help the
 *                 compiler and optimizer do a better job.
 *
 */ 

#ifndef _INLINE_PLUGIN_API_C
#define _INLINE_PLUGIN_API_C


// standard includes for ONL router
#include "dl_system.h"
#include <system_init.h>
#include <ixp_lib.h>
#include <dl_buf.c>
#include "ring_formats.h"
#include "counter_util.h" // system-wide counters
#include "string.h"

#include "plugin_dl.h"

//-------------------------------------------------------------------
// Extern
//-------------------------------------------------------------------
// Plugin id; the ONL_API_PCOUNT_* macros below pass it as the `pid` argument.
extern __declspec(gp_reg) unsigned int pluginId; 
// Next-block selector; assigned by the onl_api_set_out_to_*() helpers and onl_api_drop().
extern __declspec(gp_reg) int dlNextBlock;
// see ring_formats.h for struct definitions
extern volatile __declspec(gp_reg) plc_plugin_data ring_in;  // ring data from PLC
extern volatile __declspec(gp_reg) plugin_out_data ring_out; // ring data to next block

//-------------------------------------------------------------------
// Constants
//-------------------------------------------------------------------
//#define ETH_PAYLOAD_DRAM_OFFSET     0x18E

// IP protocol numbers for ICMP, TCP and UDP (the values assigned by IANA).
// NOTE(review): presumably meant to be compared against onl_api_ip_hdr.ip_proto;
// nothing in this part of the file uses them -- confirm against callers.
#define PROTO_ICMP  1
#define PROTO_TCP   6
#define PROTO_UDP   17


//-------------------------------------------------------------------
// Types
//-------------------------------------------------------------------

// List of new fcts to add:

// Base IPv4 header (5 words / 20 bytes, options excluded) as filled in by
// onl_api_readIpv4Hdr(). The anonymous struct is a named bit-field view and
// value[] is the same storage seen as five raw 32-bit words.
// Field widths follow the RFC 791 header layout.
typedef __declspec(packed) union onl_api_u_ip_hdr
{
    struct
    {
        // Word0:
        unsigned int ip_v:     4;  // version
        unsigned int ip_hl:    4;  // header length, in 32-bit words
	unsigned int ip_tos:   8;  // type of service
	unsigned int ip_len:  16;  // total length

        // Word1:
	unsigned int ip_id:   16;  // identification
	unsigned int ip_off:  16;  // flags + fragment offset

        // Word2:
	unsigned int ip_ttl:   8;  // time to live
	unsigned int ip_proto: 8;  // protocol (see PROTO_* constants)
	unsigned int ip_sum:  16;  // header checksum

        // Word3:
	unsigned int ip_src:  32;  // source address

        // Word4:
	unsigned int ip_dst:  32;  // destination address
    };
    unsigned int value[5];         // raw word view of the same 20 bytes
} onl_api_ip_hdr;

// Options part of an IP Header
//     There can be up to 10 32-bit words of options in an IP Header
//
// The struct view holds a count followed by the option words; value[] is the
// same storage as 11 raw words, so value[0] overlays numOptions and
// value[k+1] overlays option[k].
// NOTE(review): onl_api_readIpv4HdrOptions() stores option word k in value[k]
// (i.e. starting at value[0]) and never sets numOptions explicitly -- callers
// should confirm which view they are expected to read.
typedef __declspec(packed) union onl_api_u_ip_hdr_options
{
    struct
    {
	unsigned int numOptions; // how many option words are there
        unsigned int option[10]; // array of actual options
    };
    unsigned int value[11];      // raw word view of the same storage
} onl_api_ip_hdr_options;

// Base TCP header (5 words / 20 bytes, TCP options excluded) as filled in by
// onl_api_readTcpHdr(). The anonymous struct is a named bit-field view and
// value[] is the same storage seen as five raw 32-bit words.
// Field widths follow the TCP header layout (RFC 793, with the CWR/ECE flag bits).
typedef __declspec(packed) union onl_api_u_tcp_hdr
{
    struct
    {
        // Word0:
        unsigned int th_sport: 16;  // source port
        unsigned int th_dport: 16;  // destination port

        // Word1:
        unsigned int th_seq;        // sequence number

        // Word2:
        unsigned int th_ack;        // acknowledgment number

        // Word3:
        unsigned int th_off:  4;    // data offset
        unsigned int th_x2:   4;    // unused/reserved bits
        unsigned int th_cwr:  1;    // flag bits, one per field:
        unsigned int th_ece:  1;
        unsigned int th_urg:  1;
        unsigned int th_ackf: 1;
        unsigned int th_psh:  1;
        unsigned int th_rst:  1;
        unsigned int th_syn:  1;
        unsigned int th_fin:  1;
        unsigned int th_win:  16;   // window

        // Word4:
        unsigned int th_sum: 16;    // checksum
        unsigned int th_urp: 16;    // urgent pointer
    };
    unsigned int value[5];          // raw word view of the same 20 bytes
} onl_api_tcp_hdr;

// UDP header (2 words / 8 bytes) as filled in by onl_api_readUdpHdr().
// The anonymous struct is a named bit-field view and value[] is the same
// storage seen as two raw 32-bit words.
typedef __declspec(packed) union onl_api_u_udp_hdr
{
    struct
    {
        // Word0:
        unsigned int uh_sport: 16;  // source port
        unsigned int uh_dport: 16;  // destination port

        // Word1:
        unsigned int uh_ulen:  16;  // UDP length
        unsigned int uh_sum:   16;  // checksum
    };
    unsigned int value[2];          // raw word view of the same 8 bytes
} onl_api_udp_hdr;


// Buffer descriptor (8 words / 32 bytes) as transferred to and from SRAM by
// onl_api_readBufferDescriptor() / onl_api_writeBufferDescriptor().
// The anonymous struct names the individual fields; value[] is the same
// storage seen as eight raw 32-bit words, which is the view those two
// functions copy through.
typedef __declspec(packed) union onl_api_u_buf_desc
{
    struct
    {
	// Word0:
	unsigned int bufferNext;

	// Word1:
	unsigned int bufferSize    : 16;
	unsigned int offset        : 16;

	// Word2:
	unsigned int packetSize    : 16;
	unsigned int freelistId    :  4;
	unsigned int reserved1     :  4;
	unsigned int refCnt        :  8;

	// Word3:
	unsigned int statsIndex    : 16;
	unsigned int macDAddr47_32 : 16;

	// Word4:
	unsigned int macDAddr31_00;

	// Word5:
	unsigned int etherType     : 16;
	unsigned int reserved2     : 16;

	// Word6:
	unsigned int reserved3;

	// Word7:
	unsigned int packetNext;
    };

    // Raw word view of the same 32 bytes.
    unsigned int value[8];
} onl_api_buf_desc;

// Queue Params Structure
// Four 32-bit words, accessible either by field name or through value[].
// NOTE(review): no function in this part of the file reads or writes this
// type, so the units of length/threshold/quantum are not established here.
typedef __declspec(packed) union onl_api_u_qparams
{
    struct
    {
        // Word0:
        unsigned int length;

        // Word1:
        unsigned int threshold;

        // Word2:
        unsigned int quantum;

        // Word3:
        unsigned int reserved;
    };
    unsigned int value[4];  // raw word view of the same 16 bytes
} onl_api_qparams;

//-------------------------------------------------------------------
// Stats - read/write stats counters
//-------------------------------------------------------------------
// Increment counter `cid` of plugin `pid` by one.
void onl_api_plugin_cntr_inc(unsigned int  pid, unsigned int cid);
// Convenience form that supplies this plugin's own id (pluginId).
#define ONL_API_PCOUNT_INC(cid) (onl_api_plugin_cntr_inc(pluginId, cid))

// Update counter `cid` of plugin `pid` using `val`
// (implemented via WU_incrementAndAddGlobalRegister).
void onl_api_plugin_cntr_add(unsigned int pid, unsigned int cid, unsigned int val);
// Convenience form that supplies this plugin's own id (pluginId).
#define ONL_API_PCOUNT_ADD(cid, val) (onl_api_plugin_cntr_add(pluginId, cid, val))

//-------------------------------------------------------------------
// Packet forwarding and dropping 
//-------------------------------------------------------------------
// Each helper below only assigns dlNextBlock; none of them touches the packet.
void onl_api_set_out_to_MUX(); // set dlNextBlock
void onl_api_set_out_to_QM();  // set dlNextBlock
void onl_api_set_out_to_XSCALE_LD();  // set dlNextBlock
void onl_api_set_out_to_XSCALE_EXC();  // set dlNextBlock
void onl_api_set_out_to_XSCALE_ERR();  // set dlNextBlock
// pluginNum 0..4 selects PACKET_IN_RING_<pluginNum>; any other value selects DROP.
void onl_api_set_out_to_PLUGIN(unsigned int pluginNum);  // set dlNextBlock
void onl_api_drop(); // drop packet


// globals
// Scratch values written by get_pcntr() and consumed immediately afterwards
// by onl_api_plugin_cntr_inc() / onl_api_plugin_cntr_add().
__declspec(gp_reg) unsigned int stats_regnum; // stats register number
__declspec(gp_reg) unsigned int stats_cerr;   // stats error counter

// Shorthand accessors for the fields of the incoming ring entry (ring_in).
// NOTE(review): every expansion ends in ';', so these macros only work as the
// last thing in a statement (e.g. `x = ONL_API_RING_IN_QID;` expands to
// `x = ring_in.qid;;`). They cannot be used inside a larger expression, as a
// function argument, or in an if/while condition. Dropping the trailing ';'
// would fix that but could break callers that rely on it -- check usages first.
#define ONL_API_RING_IN_BUF_HANDLE               ring_in.buf_handle_lo24;
#define ONL_API_RING_IN_LAYER3_PKT_LENGTH        ring_in.l3_pkt_len;
#define ONL_API_RING_IN_QID                      ring_in.qid;
#define ONL_API_RING_IN_PLUGIN_TAG               ring_in.plugin_tag;
#define ONL_API_RING_IN_IN_PORT                  ring_in.in_port;
#define ONL_API_RING_IN_FLAGS                    ring_in.flags;
#define ONL_API_RING_IN_STATS_INDEX              ring_in.stats_index;
#define ONL_API_RING_IN_NEXT_HOP_ETH_DADDR_HI32  ring_in.nh_eth_daddr_hi32;
#define ONL_API_RING_IN_NEXT_HOP_ETH_DADDR_LO16  ring_in.nh_eth_daddr_lo16;
#define ONL_API_RING_IN_ETH_TYPE                 ring_in.eth_type;
#define ONL_API_RING_IN_UC_MC_BITS               ring_in.uc_mc_bits;

// Helper for get_pcntr(): expands to the selection logic for plugin N.
// It stores that plugin's dropped-request counter in stats_cerr and maps the
// counter id CID onto one of the plugin's four counter registers. Any CID
// other than 1, 2 or 3 (including 0) selects counter 0.
#define ONL_API_PCNTR_SELECT(N, CID)                                           \
    do {                                                                       \
        stats_cerr = COUNTER_REQUESTS_DROPPED_PLUGIN_##N;                      \
        switch (CID) {                                                         \
            case 1:  stats_regnum = ONL_ROUTER_PLUGIN_##N##_CNTR_1;  break;    \
            case 2:  stats_regnum = ONL_ROUTER_PLUGIN_##N##_CNTR_2;  break;    \
            case 3:  stats_regnum = ONL_ROUTER_PLUGIN_##N##_CNTR_3;  break;    \
            case 0:                                                            \
            default: stats_regnum = ONL_ROUTER_PLUGIN_##N##_CNTR_0;  break;    \
        }                                                                      \
    } while (0)

// Translate (plugin id, counter id) into the globals stats_regnum and
// stats_cerr. Nothing is returned; the results are left in those two globals
// for onl_api_plugin_cntr_inc() / onl_api_plugin_cntr_add() to use.
//
//   pid - plugin id; 0..4 select that plugin, anything else is treated as 0.
//   cid - counter id; 0..3 select that counter, anything else is treated as 0.
//
//  NOTE: out-of-range ids silently alias plugin 0 / counter 0; a proper
//        range check has been an open item since 10/02/07 (scj1).
static __forceinline void get_pcntr(int pid, int cid)
{
    switch (pid) {
        case 1:
            ONL_API_PCNTR_SELECT(1, cid);
            break;

        case 2:
            ONL_API_PCNTR_SELECT(2, cid);
            break;

        case 3:
            ONL_API_PCNTR_SELECT(3, cid);
            break;

        case 4:
            ONL_API_PCNTR_SELECT(4, cid);
            break;

        case 0:
        default:
            // Unknown plugin ids share plugin 0's counters.
            ONL_API_PCNTR_SELECT(0, cid);
            break;
    }
}

#undef ONL_API_PCNTR_SELECT


// Increment plugin counter `cid` of plugin `pid`.
// Out-of-range ids are not rejected: get_pcntr() maps them to plugin 0 /
// counter 0.
void onl_api_plugin_cntr_inc(unsigned int pid, unsigned int cid)
{
    // Resolve the ids into the stats_regnum / stats_cerr globals.
    get_pcntr(pid, cid);

    // see counter_util.c
    WU_incrementGlobalRegister(stats_regnum, stats_cerr);
}


// Update plugin counter `cid` of plugin `pid`, passing `val` to
// WU_incrementAndAddGlobalRegister().
// Out-of-range ids are not rejected: get_pcntr() maps them to plugin 0 /
// counter 0.
void onl_api_plugin_cntr_add(unsigned int pid, unsigned int cid, unsigned int val)
{
    // Resolve the ids into the stats_regnum / stats_cerr globals.
    get_pcntr(pid, cid);

    WU_incrementAndAddGlobalRegister(stats_regnum, val, stats_cerr);
}


// Select the MUX block as the next block (dlNextBlock = MUX).
void onl_api_set_out_to_MUX()
{
    dlNextBlock = MUX;
}


// Select the QM block as the next block (dlNextBlock = QM).
void onl_api_set_out_to_QM()
{
    dlNextBlock = QM;
}

// Select the XSCALE LD block as the next block (dlNextBlock = XSCALE_LD).
void onl_api_set_out_to_XSCALE_LD()
{
    dlNextBlock = XSCALE_LD;
}
// Select the XSCALE exception block as the next block (dlNextBlock = XSCALE_EXC).
void onl_api_set_out_to_XSCALE_EXC()
{
    dlNextBlock = XSCALE_EXC;
}
// Select the XSCALE error block as the next block (dlNextBlock = XSCALE_ERR).
void onl_api_set_out_to_XSCALE_ERR()
{
    dlNextBlock = XSCALE_ERR;
}

// Select another plugin's input ring as the next block.
//   pluginNum - 0..4 selects PACKET_IN_RING_<pluginNum>;
//               any other value selects DROP instead.
void onl_api_set_out_to_PLUGIN(unsigned int pluginNum)
{
    if (pluginNum == 0) {
        dlNextBlock = PACKET_IN_RING_0;
    } else if (pluginNum == 1) {
        dlNextBlock = PACKET_IN_RING_1;
    } else if (pluginNum == 2) {
        dlNextBlock = PACKET_IN_RING_2;
    } else if (pluginNum == 3) {
        dlNextBlock = PACKET_IN_RING_3;
    } else if (pluginNum == 4) {
        dlNextBlock = PACKET_IN_RING_4;
    } else {
        // No such plugin: fall back to dropping.
        dlNextBlock = DROP;
    }
}


// Mark the packet to be dropped by selecting DROP as the next block
// (per the original note, that hands it to the freelist manager block).
void onl_api_drop()
{
    dlNextBlock = DROP;
}

// Request a buffer through Dl_BufAlloc() and block until the request's signal
// has been raised.
//   buf_handle - SRAM read transfer register handed to Dl_BufAlloc() as its
//                first argument.
// NOTE(review): presumably buf_handle receives the handle of the newly
// allocated buffer; this function does not check the result, so any
// allocation-failure handling is left to the caller -- confirm against
// Dl_BufAlloc in dl_buf.c.
__forceinline void onl_api_allocate_buffer(__declspec(sram_read_reg) buf_handle_t *buf_handle)
{
   SIGNAL sram_sig;
         // Issued with sig_done, i.e. without swapping out here ...
         Dl_BufAlloc(buf_handle, BUF_FREE_LIST0, BUF_SDRAM_BASE, BUFFER_SIZE, BUF_SRAM_BASE, BUF_SRAM_SIZE, &sram_sig, sig_done, ___); 
         // ... so completion is awaited explicitly before returning.
         __wait_for_all(&sram_sig);
}

// Copy the buffer handle carried by the incoming ring entry
// (ring_in.buf_handle_lo24) into buf_handle->value.
__forceinline void onl_api_get_buf_handle(__declspec(gp_reg) buf_handle_t *buf_handle)
{
    buf_handle->value = ring_in.buf_handle_lo24;
}


// Return the value Dl_BufGetDesc() yields for buf_handle; the result is
// intended to be passed as bufDescPtr to onl_api_readBufferDescriptor() /
// onl_api_writeBufferDescriptor().
__forceinline unsigned int onl_api_getBufferDescriptorPtr(dl_buf_handle_t buf_handle)
{
    return Dl_BufGetDesc(buf_handle);
}

// Read a buffer descriptor from SRAM into a struct. 
// Descriptors are ALWAYS aligned so we don't have to worry about alignment oddities at all.
//
//   bufDescPtr    - SRAM address of the descriptor (see onl_api_getBufferDescriptorPtr()).
//   descStructPtr - destination; all eight words of value[] are overwritten.
//
// Returns 1 unconditionally; there is no failure path.
// The call swaps context while the read is outstanding (ctx_swap).
__forceinline unsigned int onl_api_readBufferDescriptor(unsigned int bufDescPtr, __declspec(gp_reg) onl_api_buf_desc *descStructPtr)
{
	SIGNAL sram_sig ;
	__declspec(sram_read_reg)  unsigned int sram_rd_regs[8];

	// Fetch the whole 8-word descriptor in one request.
	sram_read(&sram_rd_regs, (__declspec(sram) void *) bufDescPtr, 8, ctx_swap, &sram_sig);

	// Copy out of the transfer registers word by word, using constant indices.
	descStructPtr->value[0] = sram_rd_regs[0];
	descStructPtr->value[1] = sram_rd_regs[1];
	descStructPtr->value[2] = sram_rd_regs[2];
	descStructPtr->value[3] = sram_rd_regs[3];
	descStructPtr->value[4] = sram_rd_regs[4];
	descStructPtr->value[5] = sram_rd_regs[5];
	descStructPtr->value[6] = sram_rd_regs[6];
	descStructPtr->value[7] = sram_rd_regs[7];
	return (1);
}

// Write a buffer descriptor from a struct into SRAM. 
// Descriptors are ALWAYS aligned so we don't have to worry about alignment oddities at all.
//
//   bufDescPtr    - SRAM address of the descriptor (see onl_api_getBufferDescriptorPtr()).
//   descStructPtr - source; all eight words of value[] are written out.
//
// Returns 1 unconditionally; there is no failure path.
// The call swaps context while the write is outstanding (ctx_swap).
// NOTE(review): unlike the neighbouring read function this one is not
// declared __forceinline -- confirm whether that is intentional.
unsigned int onl_api_writeBufferDescriptor(unsigned int bufDescPtr, __declspec(gp_reg) onl_api_buf_desc *descStructPtr)
{
	SIGNAL sram_sig ;
	__declspec(sram_write_reg)  unsigned int sram_wr_regs[8];

	// Stage the descriptor in the write transfer registers, word by word.
	sram_wr_regs[0] = descStructPtr->value[0] ;
	sram_wr_regs[1] = descStructPtr->value[1] ;
	sram_wr_regs[2] = descStructPtr->value[2] ;
	sram_wr_regs[3] = descStructPtr->value[3] ;
	sram_wr_regs[4] = descStructPtr->value[4] ;
	sram_wr_regs[5] = descStructPtr->value[5] ;
	sram_wr_regs[6] = descStructPtr->value[6] ;
	sram_wr_regs[7] = descStructPtr->value[7] ;

	// Push all eight words to SRAM in one request.
	sram_write(&sram_wr_regs, (__declspec(sram) void *) bufDescPtr, 8, ctx_swap, &sram_sig);

	return (1);

}
// Return the value Dl_BufGetData() yields for buf_handle; the result is
// intended to be used as dramBufferPtr in onl_api_getIpv4HdrPtr().
__forceinline unsigned int onl_api_getBufferPtr(dl_buf_handle_t buf_handle)
{
    return Dl_BufGetData(buf_handle);
}
// Compute the address of the IPv4 header inside a packet buffer.
//   dramBufferPtr - address of the start of the buffer.
//   offset        - byte offset of the IPv4 header from that start.
// Returns dramBufferPtr + offset (plain unsigned addition, no range check).
__forceinline unsigned int onl_api_getIpv4HdrPtr(unsigned int dramBufferPtr, unsigned int offset)
{
    return dramBufferPtr + offset;
}

// Read the 20-byte base IPv4 header from DRAM into *ipv4_hdr_ptr.
//
//   ipHdrPtr     - DRAM byte address of the header; it may sit 0, 2, 4 or 6
//                  bytes into an 8-byte unit.
//   ipv4_hdr_ptr - destination; value[0..4] are overwritten.
//
// Returns 1 unconditionally. For an odd byte offset (1, 3, 5, 7) nothing is
// read and the destination is left untouched, yet 1 is still returned.
//
// The copy logic relies on the data returned by dram_read starting at the
// 8-byte unit that contains ipHdrPtr: cases 4 and 6 skip dram_rd_regs[0].
// Each count unit passed to dram_read yields two 32-bit registers.
// NOTE(review): dbl_shl(a, b, 16) is presumably a funnel shift returning
// (a << 16) | (b >> 16) -- confirm against the compiler intrinsics reference.
__forceinline unsigned int onl_api_readIpv4Hdr(unsigned int ipHdrPtr, __declspec(gp_reg) onl_api_ip_hdr *ipv4_hdr_ptr)
{
	SIGNAL_PAIR sp ;
	unsigned int unalignAmount;
	__declspec(dram_read_reg) unsigned int dram_rd_regs[8]; // to store IP header from dram 


	unalignAmount = (ipHdrPtr & 7); // Should be 0, 2, 4, or 6

	switch (unalignAmount) 
	{
		case 0:
			// We are aligned
			dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			ipv4_hdr_ptr->value[0] = dram_rd_regs[0];
			ipv4_hdr_ptr->value[1] = dram_rd_regs[1];
			ipv4_hdr_ptr->value[2] = dram_rd_regs[2];
			ipv4_hdr_ptr->value[3] = dram_rd_regs[3];
			ipv4_hdr_ptr->value[4] = dram_rd_regs[4];

			break;
		case 2:
			// we are off by two bytes.  Do a bunch of shifting.
			// Header bytes 2..21 of the data read still fit in 3 units (24 bytes).
			dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			ipv4_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[0], dram_rd_regs[1], 16);
			ipv4_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			ipv4_hdr_ptr->value[2] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			ipv4_hdr_ptr->value[3] = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			ipv4_hdr_ptr->value[4] = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);

			break;
		case 4:
			// we are off by one word. Just skip dram_rd_regs[0];
			dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			ipv4_hdr_ptr->value[0] = dram_rd_regs[1];
			ipv4_hdr_ptr->value[1] = dram_rd_regs[2];
			ipv4_hdr_ptr->value[2] = dram_rd_regs[3];
			ipv4_hdr_ptr->value[3] = dram_rd_regs[4];
			ipv4_hdr_ptr->value[4] = dram_rd_regs[5];

			break;
		case 6:
			// we are off by six bytes.  We need to read an extra double word and 
			// skip dram_rd_regs[0] and do a bunch of shifting.
			// Header bytes 6..25 of the data read need a 4th unit (32 bytes).
			dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrPtr, 4, sig_done, &sp);
			__wait_for_all(&sp);
			
			ipv4_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			ipv4_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			ipv4_hdr_ptr->value[2] = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			ipv4_hdr_ptr->value[3] = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);
			ipv4_hdr_ptr->value[4] = dbl_shl(dram_rd_regs[5], dram_rd_regs[6], 16);

			break;
		default:
			// Odd byte offset: not supported, destination left as-is.
			break;
	}
	return (1);
}

#define IPV4_HEADER_BASE_HDR_SIZE 20

// Reads the IPv4 header options (the words that follow the 20-byte base
// header) from DRAM into a struct and returns the number of option words.
//
//   ipHdrPtr             - DRAM byte address of the start of the IPv4 header.
//   iphl                 - header length in 32-bit words (5 = no options,
//                          15 = the maximum of 10 option words).
//   ipv4_hdr_options_ptr - destination; option word k is stored in value[k]
//                          for k = 0 .. (returned count - 1). Other words are
//                          left untouched.
//
// Returns the number of option words copied (1..10), or 0 when there is
// nothing to copy: iphl <= 5, iphl > 15, or the options start at an odd byte
// offset. In the 0 case no DRAM read is issued and the destination is not
// modified.
//
// As in onl_api_readIpv4Hdr(), the copy logic relies on the data returned by
// dram_read starting at the 8-byte unit that contains the requested address.
// The alignment that matters is therefore that of the OPTIONS pointer
// (ipHdrPtr + 20), not that of the header itself: the two always differ by
// 4 bytes modulo 8.
//
// NOTE(review): value[0] overlays the numOptions member of
// onl_api_ip_hdr_options, so after this call numOptions holds the first
// option word and option[k] holds option word k+1. That storage convention is
// kept unchanged here; callers should read the result through value[].
__forceinline unsigned int onl_api_readIpv4HdrOptions(unsigned int ipHdrPtr, unsigned int iphl, __declspec(gp_reg) onl_api_ip_hdr_options *ipv4_hdr_options_ptr)
{
	SIGNAL_PAIR sp ;
	unsigned int unalignAmount;
	__declspec(dram_read_reg) unsigned int dram_rd_regs[12]; // to store IP header options from dram 
	unsigned int numOptions;
	unsigned int ipHdrOptionsPtr;
	unsigned int numDWordsToRead;

	// Reject header lengths that carry no options or that are not valid.
	// Testing before the subtraction keeps the unsigned numOptions from
	// wrapping when iphl < 5, and bounds it to the 10 words the destination
	// and the transfer registers can hold.
	if (iphl <= 5 || iphl > 15) return 0;
	numOptions = iphl - 5;

	ipHdrOptionsPtr = ipHdrPtr + IPV4_HEADER_BASE_HDR_SIZE;

	// Byte offset of the first option inside the 8-byte unit that will be read.
	unalignAmount = (ipHdrOptionsPtr & 7); // Should be 0, 2, 4, or 6

	// The MINIMUM number of DOUBLE-WORDS we have to read is a function of
	// the number of options (1-10) and the unalignAmount (0,2,4,6):
	// ceil((unalignAmount + 4 * numOptions) / 8).
	switch (unalignAmount)
	{
	    case 0:
		numDWordsToRead = (numOptions + 1) >> 1;
	        break;
	    case 2:
	    case 4:
		numDWordsToRead = (numOptions + 2) >> 1;
	        break;
	    case 6:
		numDWordsToRead = (numOptions + 3) >> 1;
	        break;
	    default:
		// Odd byte offset: not supported by the copy logic below.
		return 0;
	}

	// One call site per length so that every dram_read gets a constant count.
	switch (numDWordsToRead) 
	{
	    case 1:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 1, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    case 2:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 2, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    case 3:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 3, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    case 4:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 4, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    case 5:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 5, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    case 6:
		dram_read(dram_rd_regs, (__declspec(dram) void*) ipHdrOptionsPtr, 6, sig_done, &sp);
		__wait_for_all(&sp);
		break;
	    default:
		// Not reachable for numOptions in 1..10; never copy unread registers.
		return 0;
	}


	switch (unalignAmount) 
	{
		case 0:
			// We are aligned
			
			if (numOptions > 0) ipv4_hdr_options_ptr->value[0]  = dram_rd_regs[0];
			if (numOptions > 1) ipv4_hdr_options_ptr->value[1]  = dram_rd_regs[1];
			if (numOptions > 2) ipv4_hdr_options_ptr->value[2]  = dram_rd_regs[2];
			if (numOptions > 3) ipv4_hdr_options_ptr->value[3]  = dram_rd_regs[3];
			if (numOptions > 4) ipv4_hdr_options_ptr->value[4]  = dram_rd_regs[4];
			if (numOptions > 5) ipv4_hdr_options_ptr->value[5]  = dram_rd_regs[5];
			if (numOptions > 6) ipv4_hdr_options_ptr->value[6]  = dram_rd_regs[6];
			if (numOptions > 7) ipv4_hdr_options_ptr->value[7]  = dram_rd_regs[7];
			if (numOptions > 8) ipv4_hdr_options_ptr->value[8]  = dram_rd_regs[8];
			if (numOptions > 9) ipv4_hdr_options_ptr->value[9]  = dram_rd_regs[9];

			break;
		case 2:
			// we are off by two bytes.  Do a bunch of shifting.
			
			if (numOptions > 0) ipv4_hdr_options_ptr->value[0]  = dbl_shl(dram_rd_regs[0], dram_rd_regs[1], 16);
			if (numOptions > 1) ipv4_hdr_options_ptr->value[1]  = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			if (numOptions > 2) ipv4_hdr_options_ptr->value[2]  = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			if (numOptions > 3) ipv4_hdr_options_ptr->value[3]  = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			if (numOptions > 4) ipv4_hdr_options_ptr->value[4]  = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);
			if (numOptions > 5) ipv4_hdr_options_ptr->value[5]  = dbl_shl(dram_rd_regs[5], dram_rd_regs[6], 16);
			if (numOptions > 6) ipv4_hdr_options_ptr->value[6]  = dbl_shl(dram_rd_regs[6], dram_rd_regs[7], 16);
			if (numOptions > 7) ipv4_hdr_options_ptr->value[7]  = dbl_shl(dram_rd_regs[7], dram_rd_regs[8], 16);
			if (numOptions > 8) ipv4_hdr_options_ptr->value[8]  = dbl_shl(dram_rd_regs[8], dram_rd_regs[9], 16);
			if (numOptions > 9) ipv4_hdr_options_ptr->value[9]  = dbl_shl(dram_rd_regs[9], dram_rd_regs[10], 16);

			break;
		case 4:
			// we are off by one word. Just skip dram_rd_regs[0];
			
			if (numOptions > 0) ipv4_hdr_options_ptr->value[0] = dram_rd_regs[1];
			if (numOptions > 1) ipv4_hdr_options_ptr->value[1] = dram_rd_regs[2];
			if (numOptions > 2) ipv4_hdr_options_ptr->value[2] = dram_rd_regs[3];
			if (numOptions > 3) ipv4_hdr_options_ptr->value[3] = dram_rd_regs[4];
			if (numOptions > 4) ipv4_hdr_options_ptr->value[4] = dram_rd_regs[5];
			if (numOptions > 5) ipv4_hdr_options_ptr->value[5] = dram_rd_regs[6];
			if (numOptions > 6) ipv4_hdr_options_ptr->value[6] = dram_rd_regs[7];
			if (numOptions > 7) ipv4_hdr_options_ptr->value[7] = dram_rd_regs[8];
			if (numOptions > 8) ipv4_hdr_options_ptr->value[8] = dram_rd_regs[9];
			if (numOptions > 9) ipv4_hdr_options_ptr->value[9] = dram_rd_regs[10];

			break;
		case 6:
			// we are off by six bytes.  We need to read an extra double word and 
			// skip dram_rd_regs[0] and do a bunch of shifting.
			
			if (numOptions > 0) ipv4_hdr_options_ptr->value[0] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			if (numOptions > 1) ipv4_hdr_options_ptr->value[1] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			if (numOptions > 2) ipv4_hdr_options_ptr->value[2] = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			if (numOptions > 3) ipv4_hdr_options_ptr->value[3] = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);
			if (numOptions > 4) ipv4_hdr_options_ptr->value[4] = dbl_shl(dram_rd_regs[5], dram_rd_regs[6], 16);
			if (numOptions > 5) ipv4_hdr_options_ptr->value[5] = dbl_shl(dram_rd_regs[6], dram_rd_regs[7], 16);
			if (numOptions > 6) ipv4_hdr_options_ptr->value[6] = dbl_shl(dram_rd_regs[7], dram_rd_regs[8], 16);
			if (numOptions > 7) ipv4_hdr_options_ptr->value[7] = dbl_shl(dram_rd_regs[8], dram_rd_regs[9], 16);
			if (numOptions > 8) ipv4_hdr_options_ptr->value[8] = dbl_shl(dram_rd_regs[9], dram_rd_regs[10], 16);
			if (numOptions > 9) ipv4_hdr_options_ptr->value[9] = dbl_shl(dram_rd_regs[10], dram_rd_regs[11], 16);

			break;
		default:
			break;
	}
	return (numOptions);
}

// Compute the address of the TCP header that follows an IPv4 header.
//   ipHdrPtr - address of the start of the IPv4 header.
//   iphl     - IPv4 header length in 32-bit words.
// Returns ipHdrPtr + 4 * iphl (plain unsigned arithmetic, no range check).
unsigned int onl_api_getTcpHdrPtr(unsigned int ipHdrPtr, unsigned int iphl)
{
    // Words -> bytes.
    const unsigned int ipHdrBytes = iphl << 2;

    return ipHdrPtr + ipHdrBytes;
}
// Read the first 20 bytes (5 words) of a TCP header from DRAM into
// *tcp_hdr_ptr. TCP options, if any, are not read.
//
//   tcpHdrPtr   - DRAM byte address of the header; it may sit 0, 2, 4 or 6
//                 bytes into an 8-byte unit.
//   tcp_hdr_ptr - destination; value[0..4] are overwritten.
//
// Returns 1 unconditionally. For an odd byte offset (1, 3, 5, 7) nothing is
// read and the destination is left untouched, yet 1 is still returned.
//
// The copy logic relies on the data returned by dram_read starting at the
// 8-byte unit that contains tcpHdrPtr: cases 4 and 6 skip dram_rd_regs[0].
// NOTE(review): dbl_shl(a, b, 16) is presumably a funnel shift returning
// (a << 16) | (b >> 16) -- confirm against the compiler intrinsics reference.
__forceinline unsigned int onl_api_readTcpHdr(unsigned int tcpHdrPtr, __declspec(gp_reg) onl_api_tcp_hdr *tcp_hdr_ptr)
{
	SIGNAL_PAIR sp ;
	unsigned int unalignAmount;
	__declspec(dram_read_reg) unsigned int dram_rd_regs[8]; // to store TCP header from dram 


	unalignAmount = (tcpHdrPtr & 7); // Should be 0, 2, 4, or 6

	switch (unalignAmount) 
	{
		case 0:
			// We are aligned
			dram_read(dram_rd_regs, (__declspec(dram) void*) tcpHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			tcp_hdr_ptr->value[0] = dram_rd_regs[0];
			tcp_hdr_ptr->value[1] = dram_rd_regs[1];
			tcp_hdr_ptr->value[2] = dram_rd_regs[2];
			tcp_hdr_ptr->value[3] = dram_rd_regs[3];
			tcp_hdr_ptr->value[4] = dram_rd_regs[4];

			break;
		case 2:
			// we are off by two bytes.  Do a bunch of shifting.
			// Header bytes 2..21 of the data read still fit in 3 units (24 bytes).
			dram_read(dram_rd_regs, (__declspec(dram) void*) tcpHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			tcp_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[0], dram_rd_regs[1], 16);
			tcp_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			tcp_hdr_ptr->value[2] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			tcp_hdr_ptr->value[3] = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			tcp_hdr_ptr->value[4] = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);

			break;
		case 4:
			// we are off by one word. Just skip dram_rd_regs[0];
			dram_read(dram_rd_regs, (__declspec(dram) void*) tcpHdrPtr, 3, sig_done, &sp);
			__wait_for_all(&sp);
			
			tcp_hdr_ptr->value[0] = dram_rd_regs[1];
			tcp_hdr_ptr->value[1] = dram_rd_regs[2];
			tcp_hdr_ptr->value[2] = dram_rd_regs[3];
			tcp_hdr_ptr->value[3] = dram_rd_regs[4];
			tcp_hdr_ptr->value[4] = dram_rd_regs[5];

			break;
		case 6:
			// we are off by six bytes.  We need to read an extra double word and 
			// skip dram_rd_regs[0] and do a bunch of shifting.
			// Header bytes 6..25 of the data read need a 4th unit (32 bytes).
			dram_read(dram_rd_regs, (__declspec(dram) void*) tcpHdrPtr, 4, sig_done, &sp);
			__wait_for_all(&sp);
			
			tcp_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			tcp_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);
			tcp_hdr_ptr->value[2] = dbl_shl(dram_rd_regs[3], dram_rd_regs[4], 16);
			tcp_hdr_ptr->value[3] = dbl_shl(dram_rd_regs[4], dram_rd_regs[5], 16);
			tcp_hdr_ptr->value[4] = dbl_shl(dram_rd_regs[5], dram_rd_regs[6], 16);

			break;
		default:
			// Odd byte offset: not supported, destination left as-is.
			break;
	}
	return (1);
}
// Compute the address of the UDP header that follows an IPv4 header.
//   ipHdrPtr - address of the start of the IPv4 header.
//   iphl     - IPv4 header length in 32-bit words.
// Returns ipHdrPtr + 4 * iphl (plain unsigned arithmetic, no range check).
unsigned int onl_api_getUdpHdrPtr(unsigned int ipHdrPtr, unsigned int iphl)
{
    // Words -> bytes.
    const unsigned int ipHdrBytes = iphl << 2;

    return ipHdrPtr + ipHdrBytes;
}
// Read the 8-byte UDP header from DRAM into *udp_hdr_ptr.
//
//   udpHdrPtr   - DRAM byte address of the header; it may sit 0, 2, 4 or 6
//                 bytes into an 8-byte unit.
//   udp_hdr_ptr - destination; value[0..1] are overwritten.
//
// Returns 1 unconditionally. For an odd byte offset (1, 3, 5, 7) nothing is
// read and the destination is left untouched, yet 1 is still returned.
//
// The copy logic relies on the data returned by dram_read starting at the
// 8-byte unit that contains udpHdrPtr: cases 4 and 6 skip dram_rd_regs[0].
// NOTE(review): dbl_shl(a, b, 16) is presumably a funnel shift returning
// (a << 16) | (b >> 16) -- confirm against the compiler intrinsics reference.
__forceinline unsigned int onl_api_readUdpHdr(unsigned int udpHdrPtr, __declspec(gp_reg) onl_api_udp_hdr *udp_hdr_ptr)
{
	SIGNAL_PAIR sp ;
	unsigned int unalignAmount;
	__declspec(dram_read_reg) unsigned int dram_rd_regs[4]; // to store UDP header from dram 


	unalignAmount = (udpHdrPtr & 7); // Should be 0, 2, 4, or 6

	switch (unalignAmount) 
	{
		case 0:
			// We are aligned
			// The whole header fits in a single 8-byte unit.
			dram_read(dram_rd_regs, (__declspec(dram) void*) udpHdrPtr, 1, sig_done, &sp);
			__wait_for_all(&sp);
			
			udp_hdr_ptr->value[0] = dram_rd_regs[0];
			udp_hdr_ptr->value[1] = dram_rd_regs[1];

			break;
		case 2:
			// we are off by two bytes.  Do a bunch of shifting.
			// The header now straddles two units, so two are read.
			dram_read(dram_rd_regs, (__declspec(dram) void*) udpHdrPtr, 2, sig_done, &sp);
			__wait_for_all(&sp);
			
			udp_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[0], dram_rd_regs[1], 16);
			udp_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);

			break;
		case 4:
			// we are off by one word. Just skip dram_rd_regs[0];
			dram_read(dram_rd_regs, (__declspec(dram) void*) udpHdrPtr, 2, sig_done, &sp);
			__wait_for_all(&sp);
			
			udp_hdr_ptr->value[0] = dram_rd_regs[1];
			udp_hdr_ptr->value[1] = dram_rd_regs[2];

			break;
		case 6:
			// we are off by six bytes.  Two units are still enough (header
			// bytes 6..13 of the data read); skip dram_rd_regs[0] and do a
			// bunch of shifting.
			dram_read(dram_rd_regs, (__declspec(dram) void*) udpHdrPtr, 2, sig_done, &sp);
			__wait_for_all(&sp);
			
			udp_hdr_ptr->value[0] = dbl_shl(dram_rd_regs[1], dram_rd_regs[2], 16);
			udp_hdr_ptr->value[1] = dbl_shl(dram_rd_regs[2], dram_rd_regs[3], 16);

			break;
		default:
			// Odd byte offset: not supported, destination left as-is.
			break;
	}
	return (1);
}


// Fixed-size arrays of 32-bit words wrapped in structs. The unaligned DRAM
// write helpers below cast their `void *val` argument to one of these to
// index the caller's data word by word.

// Six words (24 bytes of storage).
// NOTE(review): the original "14B, 22B" tag presumably lists payload sizes
// this type was used for -- confirm.
typedef struct _sixlong  //14B, 22B
{
    unsigned int i[6];
} sixlong;

// One word (4 bytes).
typedef struct _onelong  
{
    unsigned int i[1];
} onelong;
// Two words (8 bytes).
typedef struct _twolong  
{
    unsigned int i[2];
} twolong;
// Four words (16 bytes).
typedef struct _fourlong  
{
    unsigned int i[4];
} fourlong;
// Eight words (32 bytes).
typedef struct _eightlong  
{
    unsigned int i[8];
} eightlong;

// Sixteen words (64 bytes).
typedef struct _sixteenlong  
{
    unsigned int i[16];
} sixteenlong;


// Perform a 8B write to DRAM where the address is not necessarily aligned
__forceinline void onl_api_ua_write_8B_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_read_regs[4];
   __declspec(dram_write_reg) unsigned int dram_write_regs[8];
   SIGNAL_PAIR sp;
   sixlong *v;
   unsigned tail[2];
   unsigned head[2];

   if ((addr & 7) != 0) {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 2, sig_done, &sp);
      __wait_for_all(&sp);
      head[0] = dram_read_regs[0];
      head[1] = dram_read_regs[1];
      tail[0] = dram_read_regs[2];
      tail[1] = dram_read_regs[3];
   }

   v = (sixlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:
         // We don't need anything extra
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = v->i[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 1, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Get top two bytes from head, and last 6 bytes come from tail.
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[2] = ((v->i[1] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & tail[0]));
	 dram_write_regs[3] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // get top 4 bytes from Head and last 4 bytes from tail
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];
         dram_write_regs[2] = v->i[1];
         dram_write_regs[3] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Get top 6 bytes from Head, fill in last 2 bytes from tail
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[3] = ((v->i[1]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[1])));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}

// Perform a 20B write to DRAM where the address is not necessarily aligned.
//
//   addr : DRAM byte address of the first byte to write. Offsets 0, 2, 4 and 6
//          within an 8-byte quadword are handled. Any other (odd) offset falls
//          through to the offset-0 path and is NOT merged correctly.
//   val  : pointer to the five 32-bit words to store (accessed as a sixlong).
//
// dram_read/dram_write are used here in 8-byte units (a count of 1 moves two
// transfer registers), so the 20 bytes are merged with the neighbouring bytes
// of the first ("head") and last ("tail") quadwords before being written back.
// NOTE(review): this relies on the DRAM access ignoring the low three address
// bits, i.e. operating on the enclosing quadword -- confirm against the
// hardware documentation.
__forceinline void onl_api_ua_write_20B_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_readhead_regs[2], dram_readtail_regs[2];
   __declspec(dram_write_reg) unsigned int dram_write_regs[8];
   SIGNAL_PAIR sp, sp2;
   sixlong *v;
   unsigned int addr_tail;
   unsigned tail[2];
   unsigned head[2];


   // Which neighbouring quadwords have to be preserved:
   //        addr&7 == 0          only the tail (data ends 4 bytes into a quadword)
   //        addr&7 == 4          only the head (data ends exactly on a quadword boundary)
   //        addr&7 == 2 or == 6  both head and tail
   // This could still be done with a single, larger dram read; we'll add that later.
   addr_tail = (unsigned int)addr + 20;

   // The tail is needed whenever the data does not end on a quadword boundary.
   // (The test used to be "!= 4", which skipped the read in the aligned case
   // even though start_offset_0 below consumes tail[1].)
   if ((addr_tail & 7) != 0) {
      dram_read(dram_readtail_regs, (__declspec(dram) void *)addr_tail, 1, sig_done, &sp);
   }

   // The head is needed whenever the data does not start on a quadword boundary.
   if ((addr & 7) != 0) {
      dram_read(dram_readhead_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp2);
   }

   // Wait only for the reads that were actually issued, then copy the
   // transfer registers into ordinary locals.
   if ((addr_tail & 7) != 0) {
       if ((addr & 7) != 0) {
           wait_for_all(&sp, &sp2);
           head[0] = dram_readhead_regs[0];
           head[1] = dram_readhead_regs[1];
       }
       else {
           wait_for_all(&sp);
       }
       tail[0] = dram_readtail_regs[0];
       tail[1] = dram_readtail_regs[1];
   }
   else {
       if ((addr & 7) != 0) {
           wait_for_all(&sp2);
           head[0] = dram_readhead_regs[0];
           head[1] = dram_readhead_regs[1];
       }
   }

   v = (sixlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;   // odd offsets drop into the offset-0 code below
   }

   start_offset_0:
         // Data fills the first 20 bytes of three quadwords; the last word is
         // the second word of the tail quadword read above.
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = v->i[1];
         dram_write_regs[2] = v->i[2];
         dram_write_regs[3] = v->i[3];
         dram_write_regs[4] = v->i[4];
         dram_write_regs[5] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Top 2 bytes come from head, the last 2 bytes come from tail.
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[2] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[3] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[4] = dbl_shr(v->i[3], v->i[4], 16);
         dram_write_regs[5] = ((v->i[4] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & tail[1]));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // Top 4 bytes come from head; the data ends on a quadword boundary,
         // so no tail bytes are involved.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];
         dram_write_regs[2] = v->i[1];
         dram_write_regs[3] = v->i[2];
         dram_write_regs[4] = v->i[3];
         dram_write_regs[5] = v->i[4];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Top 6 bytes come from head, the last 6 bytes come from tail.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[3] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[4] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[5] = dbl_shr(v->i[3], v->i[4], 16);
         dram_write_regs[6] = ((v->i[4]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[0])));
         dram_write_regs[7] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 4, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}

// Perform an 8W (32B) write to DRAM where the address is not necessarily aligned
__forceinline void onl_api_ua_write_8W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_readhead_regs[2], dram_readtail_regs[2];
   __declspec(dram_write_reg) unsigned int dram_write_regs[10];
   SIGNAL_PAIR sp, sp2;
   sixteenlong *v;
   unsigned int addr_tail;
   unsigned tail[2];
   unsigned head[2];


   addr_tail = (unsigned int)addr + 32;

   if ((addr & 7) != 0) {
      dram_read(dram_readtail_regs, (__declspec(dram) void *)addr_tail, 1, sig_done, &sp);
      dram_read(dram_readhead_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp2);
   }
   if ((addr & 7) != 0) {
       wait_for_all(&sp, &sp2);
       head[0] = dram_readhead_regs[0];
       head[1] = dram_readhead_regs[1];
       tail[0] = dram_readtail_regs[0];
       tail[1] = dram_readtail_regs[1];
   }

   v = (sixteenlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:
         // We are aligned. Nothing special to do.
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = v->i[1];
         dram_write_regs[2] = v->i[2];
         dram_write_regs[3] = v->i[3];
         dram_write_regs[4] = v->i[4];
         dram_write_regs[5] = v->i[5];
         dram_write_regs[6] = v->i[6];
         dram_write_regs[7] = v->i[7];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 4, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Get top 2 bytes from head, and last 6 bytes come from tail
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[2] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[3] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[4] = dbl_shr(v->i[3], v->i[4], 16);
         dram_write_regs[5] = dbl_shr(v->i[4], v->i[5], 16);
         dram_write_regs[6] = dbl_shr(v->i[5], v->i[6], 16);
         dram_write_regs[7] = dbl_shr(v->i[6], v->i[7], 16);
         dram_write_regs[8] = ((v->i[7] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & tail[0]));
         dram_write_regs[9] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 5, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // Get top 4 bytes from head.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];
         dram_write_regs[2] = v->i[1];
         dram_write_regs[3] = v->i[2];
         dram_write_regs[4] = v->i[3];
         dram_write_regs[5] = v->i[4];
         dram_write_regs[6] = v->i[5];
         dram_write_regs[7] = v->i[6];
         dram_write_regs[8] = v->i[7];
         dram_write_regs[9] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 5, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Get top 6 bytes from head, fill in last 6 bytes from tail
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[3] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[4] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[5] = dbl_shr(v->i[3], v->i[4], 16);
         dram_write_regs[6] = dbl_shr(v->i[4], v->i[5], 16);
         dram_write_regs[7] = dbl_shr(v->i[5], v->i[6], 16);
         dram_write_regs[8] = dbl_shr(v->i[6], v->i[7], 16);
         dram_write_regs[9] = ((v->i[7]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[1])));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 5, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}

// Perform an 4W (16B) write to DRAM where the address is not necessarily aligned
__forceinline void onl_api_ua_write_4W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_readhead_regs[2], dram_readtail_regs[2];
   __declspec(dram_write_reg) unsigned int dram_write_regs[6];
   SIGNAL_PAIR sp, sp2;
   sixteenlong *v;
   unsigned int addr_tail;
   unsigned tail[2];
   unsigned head[2];


   addr_tail = (unsigned int)addr + 16;

   if ((addr & 7) != 0) {
      dram_read(dram_readtail_regs, (__declspec(dram) void *)addr_tail, 1, sig_done, &sp);
      dram_read(dram_readhead_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp2);
   }
   if ((addr & 7) != 0) {
       wait_for_all(&sp, &sp2);
       head[0] = dram_readhead_regs[0];
       head[1] = dram_readhead_regs[1];
       tail[0] = dram_readtail_regs[0];
       tail[1] = dram_readtail_regs[1];
   }

   v = (sixteenlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:
         // We are aligned. Nothing special to do.
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = v->i[1];
         dram_write_regs[2] = v->i[2];
         dram_write_regs[3] = v->i[3];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Get top 2 bytes from head, and last 6 bytes come from tail
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[2] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[3] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[4] = ((v->i[3] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & tail[0]));
         dram_write_regs[5] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // Get top 4 bytes from head.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];
         dram_write_regs[2] = v->i[1];
         dram_write_regs[3] = v->i[2];
         dram_write_regs[4] = v->i[3];
         dram_write_regs[5] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Get top 6 bytes from head, fill in last 6 bytes from tail
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[3] = dbl_shr(v->i[1], v->i[2], 16);
         dram_write_regs[4] = dbl_shr(v->i[2], v->i[3], 16);
         dram_write_regs[5] = ((v->i[3]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[1])));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 3, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}
// Perform a 2W (8B) write to DRAM where the address is not necessarily aligned
__forceinline void onl_api_ua_write_2W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_readhead_regs[2], dram_readtail_regs[2];
   __declspec(dram_write_reg) unsigned int dram_write_regs[4];
   SIGNAL_PAIR sp, sp2;
   sixteenlong *v;
   unsigned int addr_tail;
   unsigned tail[2];
   unsigned head[2];


   addr_tail = (unsigned int)addr + 8;

   if ((addr & 7) != 0) {
      dram_read(dram_readtail_regs, (__declspec(dram) void *)addr_tail, 1, sig_done, &sp);
      dram_read(dram_readhead_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp2);
   }
   if ((addr & 7) != 0) {
       wait_for_all(&sp, &sp2);
       head[0] = dram_readhead_regs[0];
       head[1] = dram_readhead_regs[1];
       tail[0] = dram_readtail_regs[0];
       tail[1] = dram_readtail_regs[1];
   }

   v = (sixteenlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:
         // We are aligned. Nothing special to do.
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = v->i[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 1, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Get top 2 bytes from head, and last 6 bytes come from tail
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[2] = ((v->i[1] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & tail[0]));
         dram_write_regs[3] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // Get top 4 bytes from head.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];
         dram_write_regs[2] = v->i[1];
         dram_write_regs[3] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Get top 6 bytes from head, fill in last 6 bytes from tail
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = dbl_shr(v->i[0], v->i[1], 16);
         dram_write_regs[3] = ((v->i[1]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[1])));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}

// Perform a 1W (4B) write to DRAM where the address is not necessarily aligned.
//
//   addr : DRAM byte address of the first byte to write. Offsets 0, 2, 4 and 6
//          within an 8-byte quadword are handled. Any other (odd) offset falls
//          through to the offset-0 path and is NOT merged correctly.
//   val  : pointer to the 32-bit word to store (accessed through a sixteenlong).
//
// A 4-byte write never fills a whole quadword, so the quadword containing addr
// ("head") is always read and merged with the new word. At offset 6 the word
// straddles two quadwords, so the following quadword ("tail") is read as well.
// NOTE(review): this relies on the DRAM access ignoring the low three address
// bits, i.e. operating on the enclosing quadword -- confirm against the
// hardware documentation.
__forceinline void onl_api_ua_write_1W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_readhead_regs[2], dram_readtail_regs[2];
   __declspec(dram_write_reg) unsigned int dram_write_regs[4];
   SIGNAL_PAIR sp, sp2;
   sixteenlong *v;
   unsigned int addr_tail;
   unsigned tail[2];
   unsigned head[2];


   addr_tail = (unsigned int)addr + 4;

   // Always fetch the head quadword. (This read used to be skipped for
   // addr&7 == 0 even though start_offset_0 below writes head[1] back.)
   dram_read(dram_readhead_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp2);
   if ((addr & 7) == 6) {
      // The last two data bytes land in the next quadword.
      dram_read(dram_readtail_regs, (__declspec(dram) void *)addr_tail, 1, sig_done, &sp);
      wait_for_all(&sp, &sp2);
      tail[0] = dram_readtail_regs[0];
      tail[1] = dram_readtail_regs[1];
   }
   else {
      wait_for_all(&sp2);
   }
   head[0] = dram_readhead_regs[0];
   head[1] = dram_readhead_regs[1];

   v = (sixteenlong *) val;


   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;   // odd offsets drop into the offset-0 code below
   }

   start_offset_0:
         // New word first, second word of the quadword preserved from head.
         dram_write_regs[0] = v->i[0];
         dram_write_regs[1] = (__declspec(dram) unsigned int) head[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 1, sig_done, &sp);
         __wait_for_all(&sp);
         return;


   start_offset_2:
         // Top 2 bytes and bottom 2 bytes of the quadword are preserved from head.
         dram_write_regs[0] = ((__declspec(dram) unsigned int)(head[0] & 0xFFFF0000)) |  (v->i[0] >> 16);
         dram_write_regs[1] = ((v->i[0] << 16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & head[1]));

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 1, sig_done, &sp);
         __wait_for_all(&sp);
         return;

    start_offset_4:
         // Top 4 bytes preserved from head, new word second.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = v->i[0];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 1, sig_done, &sp);
         __wait_for_all(&sp);
         return;

   start_offset_6:
         // Top 6 bytes preserved from head, last 6 bytes preserved from tail.
         dram_write_regs[0] = (__declspec(dram) unsigned int) head[0];
         dram_write_regs[1] = ((__declspec(dram) unsigned int)(head[1] & 0xFFFF0000)) | (v->i[0] >> 16);
         dram_write_regs[2] = ((v->i[0]<<16) & 0xffff0000) | ((__declspec(dram) unsigned int) (0xffff & (tail[0])));
         dram_write_regs[3] = (__declspec(dram) unsigned int) tail[1];

         dram_write(dram_write_regs, (__declspec(dram) void *) addr, 2, sig_done, &sp);
         __wait_for_all(&sp);
         return;
}

__forceinline void  onl_api_ua_read_1W_dram(unsigned int addr, void *val)
{

   __declspec(dram_read_reg) unsigned int dram_read_regs[4];
   SIGNAL_PAIR sp;
   onelong *v;
   
   v = (onelong *) val;


   if ((addr & 7) != 6) {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp);
   }
   else {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 2, sig_done, &sp);
   }
   __wait_for_all(&sp);

   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:

         v->i[0] = dram_read_regs[0];
         return;


   start_offset_2:
 
	 v->i[0] = (dram_read_regs[0] << 16) | (dram_read_regs[1] >> 16);
         return;

    start_offset_4:

         v->i[0] = dram_read_regs[1];
         return;

   start_offset_6:

	 v->i[0] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
         return;
}

// Store the IPv4 header held in *ipv4_hdr_ptr to DRAM at byte address
// ipHdrPtr, using the unaligned 20-byte writer. Always returns 1.
__forceinline unsigned int onl_api_writeIpv4Hdr(unsigned int ipHdrPtr, __declspec(gp_reg) onl_api_ip_hdr *ipv4_hdr_ptr)
{
    const unsigned int status = 1;

    onl_api_ua_write_20B_dram(ipHdrPtr, (void *) ipv4_hdr_ptr);

    return status;
}

// Store the TCP header held in *tcp_hdr_ptr to DRAM at byte address
// tcpHdrPtr, using the unaligned 20-byte writer. Always returns 1.
__forceinline unsigned int onl_api_writeTcpHdr(unsigned int tcpHdrPtr, __declspec(gp_reg) onl_api_tcp_hdr *tcp_hdr_ptr)
{
    const unsigned int status = 1;

    onl_api_ua_write_20B_dram(tcpHdrPtr, (void *) tcp_hdr_ptr);

    return status;
}

// Store the UDP header held in *udp_hdr_ptr to DRAM at byte address
// udpHdrPtr, using the unaligned 8-byte writer. Always returns 1.
__forceinline unsigned int onl_api_writeUdpHdr(unsigned int udpHdrPtr, __declspec(gp_reg) onl_api_udp_hdr *udp_hdr_ptr)
{
    const unsigned int status = 1;

    onl_api_ua_write_8B_dram(udpHdrPtr, (void *) udp_hdr_ptr);

    return status;
}

__forceinline void  onl_api_ua_read_2W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_read_regs[4];
   SIGNAL_PAIR sp;
   twolong *v;
   
   v = (twolong *) val;


   if ((addr & 7) == 0) {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 1, sig_done, &sp);
   }
   else {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 2, sig_done, &sp);
   }
   __wait_for_all(&sp);

   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:

         v->i[0] = dram_read_regs[0];
         v->i[1] = dram_read_regs[1];
         return;


   start_offset_2:
 
	 v->i[0] = (dram_read_regs[0] << 16) | (dram_read_regs[1] >> 16);
	 v->i[1] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
         return;

    start_offset_4:

         v->i[0] = dram_read_regs[1];
         v->i[1] = dram_read_regs[2];
         return;

   start_offset_6:

	 v->i[0] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
	 v->i[1] = (dram_read_regs[2] << 16) | (dram_read_regs[3] >> 16);
         return;
}

__forceinline void  onl_api_ua_read_4W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_read_regs[6];
   SIGNAL_PAIR sp;
   fourlong *v;
   
   v = (fourlong *) val;


   if ((addr & 7) == 0) {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 2, sig_done, &sp);
   }
   else {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 3, sig_done, &sp);
   }
   __wait_for_all(&sp);

   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:

         v->i[0] = dram_read_regs[0];
         v->i[1] = dram_read_regs[1];
         v->i[2] = dram_read_regs[2];
         v->i[3] = dram_read_regs[3];

         return;


   start_offset_2:
 
	 v->i[0] = (dram_read_regs[0] << 16) | (dram_read_regs[1] >> 16);
	 v->i[1] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
	 v->i[2] = (dram_read_regs[2] << 16) | (dram_read_regs[3] >> 16);
	 v->i[3] = (dram_read_regs[3] << 16) | (dram_read_regs[4] >> 16);
         return;

    start_offset_4:

         v->i[0] = dram_read_regs[1];
         v->i[1] = dram_read_regs[2];
         v->i[2] = dram_read_regs[3];
         v->i[3] = dram_read_regs[4];
         return;

   start_offset_6:

	 v->i[0] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
	 v->i[1] = (dram_read_regs[2] << 16) | (dram_read_regs[3] >> 16);
	 v->i[2] = (dram_read_regs[3] << 16) | (dram_read_regs[4] >> 16);
	 v->i[3] = (dram_read_regs[4] << 16) | (dram_read_regs[5] >> 16);
         return;
}

__forceinline void  onl_api_ua_read_8W_dram(unsigned int addr, void *val)
{
   __declspec(dram_read_reg) unsigned int dram_read_regs[10];
   SIGNAL_PAIR sp;
   eightlong *v;
   
   v = (eightlong *) val;


   if ((addr & 7) == 0) {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 4, sig_done, &sp);
   }
   else {
      dram_read(dram_read_regs, (__declspec(dram) void *)addr, 5, sig_done, &sp);
   }
   __wait_for_all(&sp);

   // Big endian only
   switch((unsigned int)addr & 7) {
      case 0: goto start_offset_0;
      case 2: goto start_offset_2;
      case 4: goto start_offset_4;
      case 6: goto start_offset_6;
      default: break;
   }
 
   start_offset_0:

         v->i[0] = dram_read_regs[0];
         v->i[1] = dram_read_regs[1];
         v->i[2] = dram_read_regs[2];
         v->i[3] = dram_read_regs[3];
         v->i[4] = dram_read_regs[4];
         v->i[5] = dram_read_regs[5];
         v->i[6] = dram_read_regs[6];
         v->i[7] = dram_read_regs[7];

         return;


   start_offset_2:
 
	 v->i[0] = (dram_read_regs[0] << 16) | (dram_read_regs[1] >> 16);
	 v->i[1] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
	 v->i[2] = (dram_read_regs[2] << 16) | (dram_read_regs[3] >> 16);
	 v->i[3] = (dram_read_regs[3] << 16) | (dram_read_regs[4] >> 16);
	 v->i[4] = (dram_read_regs[4] << 16) | (dram_read_regs[5] >> 16);
	 v->i[5] = (dram_read_regs[5] << 16) | (dram_read_regs[6] >> 16);
	 v->i[6] = (dram_read_regs[6] << 16) | (dram_read_regs[7] >> 16);
	 v->i[7] = (dram_read_regs[7] << 16) | (dram_read_regs[8] >> 16);
         return;

    start_offset_4:

         v->i[0] = dram_read_regs[1];
         v->i[1] = dram_read_regs[2];
         v->i[2] = dram_read_regs[3];
         v->i[3] = dram_read_regs[4];
         v->i[4] = dram_read_regs[5];
         v->i[5] = dram_read_regs[6];
         v->i[6] = dram_read_regs[7];
         v->i[7] = dram_read_regs[8];

         return;

   start_offset_6:

	 v->i[0] = (dram_read_regs[1] << 16) | (dram_read_regs[2] >> 16);
	 v->i[1] = (dram_read_regs[2] << 16) | (dram_read_regs[3] >> 16);
	 v->i[2] = (dram_read_regs[3] << 16) | (dram_read_regs[4] >> 16);
	 v->i[3] = (dram_read_regs[4] << 16) | (dram_read_regs[5] >> 16);
	 v->i[4] = (dram_read_regs[5] << 16) | (dram_read_regs[6] >> 16);
	 v->i[5] = (dram_read_regs[6] << 16) | (dram_read_regs[7] >> 16);
	 v->i[6] = (dram_read_regs[7] << 16) | (dram_read_regs[8] >> 16);
	 v->i[7] = (dram_read_regs[8] << 16) | (dram_read_regs[9] >> 16);

         return;
}

// Return the address of the TCP payload: the header address plus the header
// length, where th_off is the header length in 32-bit words (th_off * 4 bytes).
__forceinline unsigned int onl_api_getTcpPacketPayloadPtr(unsigned int tcpHdrPtr, unsigned int th_off)
{
	const unsigned int hdrBytes = th_off << 2;

	return tcpHdrPtr + hdrBytes;
}

#define UDP_HEADER_SIZE_IN_BYTES 8
// Return the address of the UDP payload, i.e. the header address advanced
// past the fixed-size UDP header.
__forceinline unsigned int onl_api_getUdpPacketPayloadPtr(unsigned int udpHdrPtr)
{
	return udpHdrPtr + UDP_HEADER_SIZE_IN_BYTES;
}

// 
// Load the queue parameter record for queue `qid` from SRAM into *qparams.
//
// Only the low 16 bits of qid select the record; the record lives at
// QPARAMS_BASE_ADDR + index * QPARAMS_UNIT_SIZE. Four words are read and
// copied into qparams->value[0..3]. The read is issued with ctx_swap, and the
// transfer registers are consumed right after the call returns.
__forceinline void onl_api_getQueueParams(unsigned int qid, __declspec(gp_reg) onl_api_qparams *qparams)
{
    __declspec(sram_read_reg)  unsigned int rd_xfer[4];
    SIGNAL rd_done;
    unsigned int entryAddr;

    entryAddr = QPARAMS_BASE_ADDR + ((qid & 0x0000FFFF) * QPARAMS_UNIT_SIZE);

    sram_read(&rd_xfer, (__declspec(sram) void *) entryAddr, 4, ctx_swap, &rd_done);

    // Constant indices only: source and destination may both be registers.
    qparams->value[0] = rd_xfer[0];
    qparams->value[1] = rd_xfer[1];
    qparams->value[2] = rd_xfer[2];
    qparams->value[3] = rd_xfer[3];
}

// Compute the 16-bit one's-complement checksum over the five header words in
// ipv4_hdr_ptr->value[0..4]. The low half of value[2] (the ip_sum field) is
// left out of the sum. Returns the complemented sum in the low 16 bits.
__forceinline unsigned int onl_api_ipv4Hdr_cksum16(__declspec(gp_reg) onl_api_ip_hdr *ipv4_hdr_ptr)
{
    unsigned int sum;

    sum  = (ipv4_hdr_ptr->value[0] >> 16) + (ipv4_hdr_ptr->value[0] & 0xFFFF);
    sum += (ipv4_hdr_ptr->value[1] >> 16) + (ipv4_hdr_ptr->value[1] & 0xFFFF);
    // value[2]: only the upper half is summed; the lower half is ip_sum itself.
    sum += ipv4_hdr_ptr->value[2] >> 16;
    sum += (ipv4_hdr_ptr->value[3] >> 16) + (ipv4_hdr_ptr->value[3] & 0xFFFF);
    sum += (ipv4_hdr_ptr->value[4] >> 16) + (ipv4_hdr_ptr->value[4] & 0xFFFF);

    // Fold carries back in until the sum fits in 16 bits.
    while (sum > 0xFFFF) {
        sum = (sum >> 16) + (sum & 0xFFFF);
    }

    return (~sum) & 0xFFFF;
}
// Add the two 16-bit halves of one word (values viewed as a onelong) to the
// running checksum accumulator and return the new accumulator. No carry
// folding is done here.
__forceinline unsigned int onl_api_cksum16_1W(unsigned int cksum, void *values)
{
   onelong *words = (onelong *) values;

   return cksum + (words->i[0] >> 16) + (words->i[0] & 0xFFFF);
}

// Add the 16-bit halves of two words (values viewed as a twolong) to the
// running checksum accumulator and return the new accumulator. No carry
// folding is done here.
__forceinline unsigned int onl_api_cksum16_2W(unsigned int cksum, void *values)
{
   twolong *words = (twolong *) values;

   cksum += (words->i[0] >> 16) + (words->i[0] & 0xFFFF);
   cksum += (words->i[1] >> 16) + (words->i[1] & 0xFFFF);

   return cksum;
}

// Add the 16-bit halves of four words (values viewed as a fourlong) to the
// running checksum accumulator and return the new accumulator. No carry
// folding is done here.
__forceinline unsigned int onl_api_cksum16_4W(unsigned int cksum, void *values)
{
   fourlong *words = (fourlong *) values;

   cksum += (words->i[0] >> 16) + (words->i[0] & 0xFFFF);
   cksum += (words->i[1] >> 16) + (words->i[1] & 0xFFFF);
   cksum += (words->i[2] >> 16) + (words->i[2] & 0xFFFF);
   cksum += (words->i[3] >> 16) + (words->i[3] & 0xFFFF);

   return cksum;
}

// Add the 16-bit halves of eight words (values viewed as an eightlong) to the
// running checksum accumulator and return the new accumulator. No carry
// folding is done here.
__forceinline unsigned int onl_api_cksum16_8W(unsigned int cksum, void *values)
{
   eightlong *words = (eightlong *) values;

   cksum += (words->i[0] >> 16) + (words->i[0] & 0xFFFF);
   cksum += (words->i[1] >> 16) + (words->i[1] & 0xFFFF);
   cksum += (words->i[2] >> 16) + (words->i[2] & 0xFFFF);
   cksum += (words->i[3] >> 16) + (words->i[3] & 0xFFFF);
   cksum += (words->i[4] >> 16) + (words->i[4] & 0xFFFF);
   cksum += (words->i[5] >> 16) + (words->i[5] & 0xFFFF);
   cksum += (words->i[6] >> 16) + (words->i[6] & 0xFFFF);
   cksum += (words->i[7] >> 16) + (words->i[7] & 0xFFFF);

   return cksum;
}

// Add the 16-bit halves of sixteen words (values viewed as a sixteenlong) to
// the running checksum accumulator and return the new accumulator. No carry
// folding is done here. The words are addressed with constant indices rather
// than a loop, as elsewhere in this file.
__forceinline unsigned int onl_api_cksum16_16W(unsigned int cksum, void *values)
{
   sixteenlong *words = (sixteenlong *) values;

   cksum += (words->i[0] >> 16) + (words->i[0] & 0xFFFF);
   cksum += (words->i[1] >> 16) + (words->i[1] & 0xFFFF);
   cksum += (words->i[2] >> 16) + (words->i[2] & 0xFFFF);
   cksum += (words->i[3] >> 16) + (words->i[3] & 0xFFFF);
   cksum += (words->i[4] >> 16) + (words->i[4] & 0xFFFF);
   cksum += (words->i[5] >> 16) + (words->i[5] & 0xFFFF);
   cksum += (words->i[6] >> 16) + (words->i[6] & 0xFFFF);
   cksum += (words->i[7] >> 16) + (words->i[7] & 0xFFFF);
   cksum += (words->i[8] >> 16) + (words->i[8] & 0xFFFF);
   cksum += (words->i[9] >> 16) + (words->i[9] & 0xFFFF);
   cksum += (words->i[10] >> 16) + (words->i[10] & 0xFFFF);
   cksum += (words->i[11] >> 16) + (words->i[11] & 0xFFFF);
   cksum += (words->i[12] >> 16) + (words->i[12] & 0xFFFF);
   cksum += (words->i[13] >> 16) + (words->i[13] & 0xFFFF);
   cksum += (words->i[14] >> 16) + (words->i[14] & 0xFFFF);
   cksum += (words->i[15] >> 16) + (words->i[15] & 0xFFFF);

   return cksum;
}

// Finish a checksum accumulated by the onl_api_cksum16_* helpers: fold every
// carry above bit 15 back into the low 16 bits, then return the one's
// complement of the result, masked to 16 bits.
__forceinline unsigned int onl_api_cksum16_final(unsigned int cksum)
{
    unsigned int folded = cksum;

    // Repeat, because adding the carry can itself overflow 16 bits.
    while (folded > 0xFFFF) {
        folded = (folded >> 16) + (folded & 0xFFFF);
    }

    return ~folded & 0x0000FFFF;
}
// Scratch buffer of sixteen unsigned int words. onl_api_udp_cksum uses one as
// its local staging area for the data it feeds to the onl_api_cksum16_* helpers.
typedef struct _sixteenWordsOfData
{
    unsigned int i[16];   // the words, indexed 0..15
} sixteenWordsOfData;


/*
 * Compute the UDP checksum over the IPv4 pseudo-header, the UDP header
 * (with its checksum field treated as zero) and the UDP payload.
 *
 *   ipv4_hdr  - IPv4 header; ip_proto, ip_src and ip_dst feed the
 *               pseudo-header.
 *   udp_hdr   - UDP header; uh_ulen supplies the datagram length and
 *               value[0..1] are the two header words.
 *   udpHdrPtr - address of the UDP header as handed to the
 *               onl_api_ua_read_*_dram helpers; the payload is read
 *               starting 8 bytes past it.
 *
 * Returns the folded, complemented 16-bit checksum in the low half of the
 * result.
 *
 * NOTE(review): RFC 768 transmits a computed checksum of 0 as 0xFFFF.  This
 * routine returns the raw value, so callers presumably handle that
 * substitution themselves - confirm.
 */
__forceinline unsigned int onl_api_udp_cksum(__declspec(gp_reg) onl_api_ip_hdr *ipv4_hdr, __declspec(gp_reg) onl_api_udp_hdr *udp_hdr, unsigned int udpHdrPtr)
{
    sixteenWordsOfData payloadData;
    unsigned int udpCksum;
    unsigned int dataPtr;
    unsigned int udpLength;

    udpCksum = 0;
    udpLength = udp_hdr->uh_ulen;

    // Words 0-1: padding so that pseudo-header + UDP header fill exactly
    //            eight words for onl_api_cksum16_8W (zeros do not change
    //            the sum).
    // Word  2  : pseudo-header {zero, protocol, UDP length}
    // Words 3-4: pseudo-header source and destination addresses
    // Words 5-6: UDP header with the checksum half-word cleared
    // Word  7  : padding
    payloadData.i[0] = 0;
    payloadData.i[1] = 0;
    payloadData.i[2] = (ipv4_hdr->ip_proto << 16) | (udpLength);
    payloadData.i[3] = ipv4_hdr->ip_src;
    payloadData.i[4] = ipv4_hdr->ip_dst;
    payloadData.i[5] = udp_hdr->value[0];
    payloadData.i[6] = (udp_hdr->value[1] & 0xFFFF0000); // take out the udp cksum
    payloadData.i[7] = 0;

    udpCksum = onl_api_cksum16_8W(udpCksum, (void *) &payloadData);

    // Remaining bytes are the payload behind the 8-byte UDP header.  A
    // length field smaller than the header is malformed; clamp it so the
    // unsigned subtraction cannot wrap and send the loops below reading
    // through DRAM far past the packet.
    if (udpLength < 8) {
        udpLength = 0;
    } else {
        udpLength -= 8;
    }
    dataPtr = udpHdrPtr + 8;

    // Sum the payload in the largest chunks available: 8, then 4, then 2 words.
    while (udpLength >= 32) {
        onl_api_ua_read_8W_dram(dataPtr,  (void *) &payloadData);
        udpCksum = onl_api_cksum16_8W(udpCksum, (void *) &payloadData);
        udpLength -= 32;
        dataPtr += 32;
    }

    while (udpLength >= 16) {
        onl_api_ua_read_4W_dram(dataPtr,  (void *) &payloadData);
        udpCksum = onl_api_cksum16_4W(udpCksum, (void *) &payloadData);
        udpLength -= 16;
        dataPtr += 16;
    }

    while (udpLength >= 8) {
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        udpCksum = onl_api_cksum16_2W(udpCksum, (void *) &payloadData);
        udpLength -= 8;
        dataPtr += 8;
    }

    // 0-7 bytes are left.  Read one or two words and mask off the bytes
    // that lie beyond the end of the datagram so they count as zero.
    switch (udpLength) {
    case 0:
        // We are done!
        break;
    case 1:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFF000000;
        udpCksum = onl_api_cksum16_1W(udpCksum, (void *) &payloadData);
        break;
    case 2:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFFFF0000;
        udpCksum = onl_api_cksum16_1W(udpCksum, (void *) &payloadData);
        break;
    case 3:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFFFFFF00;
        udpCksum = onl_api_cksum16_1W(udpCksum, (void *) &payloadData);
        break;
    case 4:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        udpCksum = onl_api_cksum16_1W(udpCksum, (void *) &payloadData);
        break;
    case 5:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFF000000;
        udpCksum = onl_api_cksum16_2W(udpCksum, (void *) &payloadData);
        break;
    case 6:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFFFF0000;
        udpCksum = onl_api_cksum16_2W(udpCksum, (void *) &payloadData);
        break;
    case 7:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFFFFFF00;
        udpCksum = onl_api_cksum16_2W(udpCksum, (void *) &payloadData);
        break;
    default:
        // Not reachable: the loop above leaves fewer than 8 bytes.
        break;
    }
    udpCksum = onl_api_cksum16_final(udpCksum);

    return(udpCksum);
}


/*
 * Compute the TCP checksum over the IPv4 pseudo-header, the TCP header
 * (with its checksum field treated as zero) and everything that follows it
 * in the segment.
 *
 *   ipv4_hdr  - IPv4 header; ip_len, ip_proto, ip_src and ip_dst are used.
 *   tcp_hdr   - TCP header; value[0..4] are its first five words.
 *   tcpHdrPtr - address of the TCP header as handed to the
 *               onl_api_ua_read_*_dram helpers; data is read starting
 *               20 bytes past it, so any TCP options are summed together
 *               with the payload.
 *
 * Returns the folded, complemented 16-bit checksum in the low half of the
 * result.
 *
 * NOTE(review): the TCP length is derived as ip_len - 20, i.e. a 20-byte
 * IPv4 header without options is assumed.  Packets carrying IP options
 * would yield a wrong length here - confirm callers filter those out.
 */
__forceinline unsigned int onl_api_tcp_cksum(__declspec(gp_reg) onl_api_ip_hdr *ipv4_hdr, __declspec(gp_reg) onl_api_tcp_hdr *tcp_hdr, unsigned int tcpHdrPtr)
{
    sixteenWordsOfData payloadData;
    unsigned int tcpCksum;
    unsigned int dataPtr;
    unsigned int tcpLength;

    tcpCksum = 0;

    // TCP segment length = IP total length minus the (assumed 20-byte) IP
    // header.  Clamp instead of letting a too-short ip_len wrap around.
    tcpLength = ipv4_hdr->ip_len;
    if (tcpLength < 20) {
        tcpLength = 0;
    } else {
        tcpLength -= 20;
    }

    // Words 0-1  : padding (zeros do not change the sum)
    // Word  2    : pseudo-header {zero, protocol, TCP length}
    // Words 3-4  : pseudo-header source and destination addresses
    // Words 5-9  : first 20 bytes of the TCP header, checksum half-word cleared
    // Words 10-15: padding up to the sixteen words onl_api_cksum16_16W sums
    payloadData.i[0] = 0;
    payloadData.i[1] = 0;
    payloadData.i[2] = (ipv4_hdr->ip_proto << 16) | (tcpLength);
    payloadData.i[3] = ipv4_hdr->ip_src;
    payloadData.i[4] = ipv4_hdr->ip_dst;
    payloadData.i[5] = tcp_hdr->value[0];
    payloadData.i[6] = tcp_hdr->value[1];
    payloadData.i[7] = tcp_hdr->value[2];
    payloadData.i[8] = tcp_hdr->value[3];
    payloadData.i[9] = (tcp_hdr->value[4] & 0x0000FFFF);  // take out the checksum
    payloadData.i[10] = 0;
    payloadData.i[11] = 0;
    payloadData.i[12] = 0;
    payloadData.i[13] = 0;
    payloadData.i[14] = 0;
    payloadData.i[15] = 0;

    tcpCksum = onl_api_cksum16_16W(tcpCksum, (void *) &payloadData);

    // The first 20 header bytes are already summed.  A segment shorter
    // than that is malformed; clamp so the unsigned subtraction cannot
    // wrap and send the loops below reading through DRAM far past the
    // packet.
    if (tcpLength < 20) {
        tcpLength = 0;
    } else {
        tcpLength -= 20;
    }
    dataPtr = tcpHdrPtr + 20;

    // Sum the rest in the largest chunks available: 8, then 4, then 2 words.
    while (tcpLength >= 32) {
        onl_api_ua_read_8W_dram(dataPtr,  (void *) &payloadData);
        tcpCksum = onl_api_cksum16_8W(tcpCksum, (void *) &payloadData);
        tcpLength -= 32;
        dataPtr += 32;
    }

    while (tcpLength >= 16) {
        onl_api_ua_read_4W_dram(dataPtr,  (void *) &payloadData);
        tcpCksum = onl_api_cksum16_4W(tcpCksum, (void *) &payloadData);
        tcpLength -= 16;
        dataPtr += 16;
    }

    while (tcpLength >= 8) {
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        tcpCksum = onl_api_cksum16_2W(tcpCksum, (void *) &payloadData);
        tcpLength -= 8;
        dataPtr += 8;
    }

    // 0-7 bytes are left.  Read one or two words and mask off the bytes
    // that lie beyond the end of the segment so they count as zero.
    switch (tcpLength) {
    case 0:
        // We are done!
        break;
    case 1:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFF000000;
        tcpCksum = onl_api_cksum16_1W(tcpCksum, (void *) &payloadData);
        break;
    case 2:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFFFF0000;
        tcpCksum = onl_api_cksum16_1W(tcpCksum, (void *) &payloadData);
        break;
    case 3:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[0] = payloadData.i[0] & 0xFFFFFF00;
        tcpCksum = onl_api_cksum16_1W(tcpCksum, (void *) &payloadData);
        break;
    case 4:
        onl_api_ua_read_1W_dram(dataPtr,  (void *) &payloadData);
        tcpCksum = onl_api_cksum16_1W(tcpCksum, (void *) &payloadData);
        break;
    case 5:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFF000000;
        tcpCksum = onl_api_cksum16_2W(tcpCksum, (void *) &payloadData);
        break;
    case 6:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFFFF0000;
        tcpCksum = onl_api_cksum16_2W(tcpCksum, (void *) &payloadData);
        break;
    case 7:
        onl_api_ua_read_2W_dram(dataPtr,  (void *) &payloadData);
        payloadData.i[1] = payloadData.i[1] & 0xFFFFFF00;
        tcpCksum = onl_api_cksum16_2W(tcpCksum, (void *) &payloadData);
        break;
    default:
        // Not reachable: the loop above leaves fewer than 8 bytes.
        break;
    }
    tcpCksum = onl_api_cksum16_final(tcpCksum);

    return(tcpCksum);
}

/*
 * Store the low 24 bits of buf_handle in
 * ring_out.plugin_freelist_data_out.buf_handle_lo24.
 */
__forceinline void onl_api_update_ring_out_to_freelist(unsigned int buf_handle)
{
    unsigned int handle_lo24 = buf_handle & 0x00FFFFFF;

    ring_out.plugin_freelist_data_out.buf_handle_lo24 = handle_lo24;
}

/*
 * Fill in ring_out.plugin_qm_data_out.  Every argument is truncated to the
 * width written below before it is stored:
 *
 *   buf_handle - low 24 bits
 *   out_port   - low 4 bits
 *   qid        - low 16 bits
 *   l3_pkt_len - low 16 bits
 */
__forceinline void onl_api_update_ring_out_to_qm(unsigned int buf_handle,
                                                 unsigned int out_port,
                                                 unsigned int qid,
                                                 unsigned int l3_pkt_len)
{
    unsigned int handle_lo24 = buf_handle & 0x00FFFFFF;
    unsigned int port_lo4    = out_port   & 0x0000000F;
    unsigned int qid_lo16    = qid        & 0x0000FFFF;
    unsigned int len_lo16    = l3_pkt_len & 0x0000FFFF;

    ring_out.plugin_qm_data_out.buf_handle_lo24 = handle_lo24;
    ring_out.plugin_qm_data_out.out_port = port_lo4;
    ring_out.plugin_qm_data_out.qid = qid_lo16;
    ring_out.plugin_qm_data_out.l3_pkt_len = len_lo16;
}

/*
 * Fill in ring_out.plugin_mux_data_out.  Every argument is truncated to the
 * width written below before it is stored:
 *
 *   buf_handle - low 24 bits
 *   out_port   - low 4 bits
 *   in_port    - low 4 bits
 *   plugin_tag - low 4 bits
 *   statsIndex - low 16 bits (stored in stats_index)
 *   flags      - low 8 bits
 *   qid        - low 16 bits
 *   l3_pkt_len - low 16 bits
 */
__forceinline void onl_api_update_ring_out_to_mux(unsigned int buf_handle,
                                                  unsigned int out_port,
                                                  unsigned int in_port,
                                                  unsigned int plugin_tag,
                                                  unsigned int statsIndex,
                                                  unsigned int flags,
                                                  unsigned int qid,
                                                  unsigned int l3_pkt_len)
{
    unsigned int handle_lo24 = buf_handle & 0x00FFFFFF;
    unsigned int oport_lo4   = out_port   & 0x0000000F;
    unsigned int iport_lo4   = in_port    & 0x0000000F;
    unsigned int tag_lo4     = plugin_tag & 0x0000000F;
    unsigned int stats_lo16  = statsIndex & 0x0000FFFF;
    unsigned int flags_lo8   = flags      & 0x000000FF;
    unsigned int qid_lo16    = qid        & 0x0000FFFF;
    unsigned int len_lo16    = l3_pkt_len & 0x0000FFFF;

    ring_out.plugin_mux_data_out.buf_handle_lo24 = handle_lo24;
    ring_out.plugin_mux_data_out.out_port = oport_lo4;
    ring_out.plugin_mux_data_out.in_port = iport_lo4;
    ring_out.plugin_mux_data_out.plugin_tag = tag_lo4;
    ring_out.plugin_mux_data_out.stats_index = stats_lo16;
    ring_out.plugin_mux_data_out.flags = flags_lo8;
    ring_out.plugin_mux_data_out.qid = qid_lo16;
    ring_out.plugin_mux_data_out.l3_pkt_len = len_lo16;
}

/*
 * Fill in ring_out.plugin_xscale_data_out.  The set of fields written is
 * the same as in onl_api_update_ring_out_to_plugin.
 *
 *   buf_handle        - low 24 bits
 *   out_port          - accepted but not stored
 *   in_port           - low 4 bits
 *   plugin_tag        - low 4 bits
 *   statsIndex        - low 16 bits (stored in stats_index)
 *   flags             - low 8 bits
 *   qid               - low 16 bits
 *   nh_eth_daddr_hi32 - stored unmasked
 *   nh_eth_daddr_lo16 - low 16 bits
 *   eth_type          - low 16 bits
 *   uc_mc_bits        - low 16 bits
 *   l3_pkt_len        - low 16 bits
 */
__forceinline void onl_api_update_ring_out_to_xscale(unsigned int buf_handle,
                                                     unsigned int out_port,
                                                     unsigned int in_port,
                                                     unsigned int plugin_tag,
                                                     unsigned int statsIndex,
                                                     unsigned int flags,
                                                     unsigned int qid,
                                                     unsigned int nh_eth_daddr_hi32,
                                                     unsigned int nh_eth_daddr_lo16,
                                                     unsigned int eth_type,
                                                     unsigned int uc_mc_bits,
                                                     unsigned int l3_pkt_len)
{
    unsigned int handle_lo24 = buf_handle        & 0x00FFFFFF;
    unsigned int iport_lo4   = in_port           & 0x0000000F;
    unsigned int tag_lo4     = plugin_tag        & 0x0000000F;
    unsigned int stats_lo16  = statsIndex        & 0x0000FFFF;
    unsigned int flags_lo8   = flags             & 0x000000FF;
    unsigned int qid_lo16    = qid               & 0x0000FFFF;
    unsigned int daddr_lo16  = nh_eth_daddr_lo16 & 0x0000FFFF;
    unsigned int etype_lo16  = eth_type          & 0x0000FFFF;
    unsigned int ucmc_lo16   = uc_mc_bits        & 0x0000FFFF;
    unsigned int len_lo16    = l3_pkt_len        & 0x0000FFFF;

    // out_port is deliberately left out: nothing is written for it here.
    ring_out.plugin_xscale_data_out.buf_handle_lo24 = handle_lo24;
    ring_out.plugin_xscale_data_out.in_port = iport_lo4;
    ring_out.plugin_xscale_data_out.plugin_tag = tag_lo4;
    ring_out.plugin_xscale_data_out.stats_index = stats_lo16;
    ring_out.plugin_xscale_data_out.flags = flags_lo8;
    ring_out.plugin_xscale_data_out.qid = qid_lo16;
    ring_out.plugin_xscale_data_out.nh_eth_daddr_hi32 = nh_eth_daddr_hi32;
    ring_out.plugin_xscale_data_out.nh_eth_daddr_lo16 = daddr_lo16;
    ring_out.plugin_xscale_data_out.eth_type = etype_lo16;
    ring_out.plugin_xscale_data_out.uc_mc_bits = ucmc_lo16;
    ring_out.plugin_xscale_data_out.l3_pkt_len = len_lo16;
}

/*
 * Fill in ring_out.plugin_plugin_data_out.  The set of fields written is
 * the same as in onl_api_update_ring_out_to_xscale.
 *
 *   buf_handle        - low 24 bits
 *   out_port          - accepted but not stored
 *   in_port           - low 4 bits
 *   plugin_tag        - low 4 bits
 *   statsIndex        - low 16 bits (stored in stats_index)
 *   flags             - low 8 bits
 *   qid               - low 16 bits
 *   nh_eth_daddr_hi32 - stored unmasked
 *   nh_eth_daddr_lo16 - low 16 bits
 *   eth_type          - low 16 bits
 *   uc_mc_bits        - low 16 bits
 *   l3_pkt_len        - low 16 bits
 */
__forceinline void onl_api_update_ring_out_to_plugin(unsigned int buf_handle,
                                                     unsigned int out_port,
                                                     unsigned int in_port,
                                                     unsigned int plugin_tag,
                                                     unsigned int statsIndex,
                                                     unsigned int flags,
                                                     unsigned int qid,
                                                     unsigned int nh_eth_daddr_hi32,
                                                     unsigned int nh_eth_daddr_lo16,
                                                     unsigned int eth_type,
                                                     unsigned int uc_mc_bits,
                                                     unsigned int l3_pkt_len)
{
    unsigned int handle_lo24 = buf_handle        & 0x00FFFFFF;
    unsigned int iport_lo4   = in_port           & 0x0000000F;
    unsigned int tag_lo4     = plugin_tag        & 0x0000000F;
    unsigned int stats_lo16  = statsIndex        & 0x0000FFFF;
    unsigned int flags_lo8   = flags             & 0x000000FF;
    unsigned int qid_lo16    = qid               & 0x0000FFFF;
    unsigned int daddr_lo16  = nh_eth_daddr_lo16 & 0x0000FFFF;
    unsigned int etype_lo16  = eth_type          & 0x0000FFFF;
    unsigned int ucmc_lo16   = uc_mc_bits        & 0x0000FFFF;
    unsigned int len_lo16    = l3_pkt_len        & 0x0000FFFF;

    // out_port is deliberately left out: nothing is written for it here.
    ring_out.plugin_plugin_data_out.buf_handle_lo24 = handle_lo24;
    ring_out.plugin_plugin_data_out.in_port = iport_lo4;
    ring_out.plugin_plugin_data_out.plugin_tag = tag_lo4;
    ring_out.plugin_plugin_data_out.stats_index = stats_lo16;
    ring_out.plugin_plugin_data_out.flags = flags_lo8;
    ring_out.plugin_plugin_data_out.qid = qid_lo16;
    ring_out.plugin_plugin_data_out.nh_eth_daddr_hi32 = nh_eth_daddr_hi32;
    ring_out.plugin_plugin_data_out.nh_eth_daddr_lo16 = daddr_lo16;
    ring_out.plugin_plugin_data_out.eth_type = etype_lo16;
    ring_out.plugin_plugin_data_out.uc_mc_bits = ucmc_lo16;
    ring_out.plugin_plugin_data_out.l3_pkt_len = len_lo16;
}
#endif	// _INLINE_PLUGIN_API_C
