/*
 * $RCSfile: ocsum.c,v $
 *
 * x-kernel v3.3
 *
 * Copyright (c) 1993,1991,1990,1996  Arizona Board of Regents
 *
 * $Log: ocsum.c,v $
 * Revision 1.2  1996/01/29 19:49:20  slm
 * Updated copyright and version.
 *
 * Revision 1.1  1995/07/28  21:41:44  slm
 * Initial revision
 *
 * Revision 1.8.1.3  1994/12/06  19:22:38  davidm
 * (inCkSum): removed extraneous line-feeds in xTrace* calls.
 *
 * Revision 1.8.1.2  1994/10/27  20:54:04  hkaram
 * Davids new Version
 *
 * Revision 1.8  1994/06/03  21:56:14  davidm
 * (ocsum): xAssert(count < 65536) was correct for 32 bit machines only.
 *
 * Revision 1.7  1994/04/23  02:10:12  davidm
 * (ocsum): Must be able to properly deal with count==0.
 *
 * Revision 1.6  1994/04/17  04:57:55  davidm
 * (ocsum): Extended branch-less code to support 64 bit architectures (should
 * 	work for at least the OSF/1 on the Alphas and the 64 bit Irix.
 * 	Also fixed the polarity of the "#ifndef intelx86" switch.
 *
 * Revision 1.5  1994/04/11  23:15:34  ho
 * Uses no-branching ocsum routine; faster on most architectures,
 * except the intelx86.  An ifdef controls which one gets used.
 *
 * Revision 1.4  1994/02/05  00:09:06  menze
 *   [ 1994/01/28          menze ]
 *   assert.h renamed to xk_assert.h
 *
 * Revision 1.3  1993/12/15  23:25:25  menze
 * Modifications from UMass:
 *    Fixed msgLen deadlock in a trace statement
 *    Zero-test in if statement made explicit to stop GCC
 *    from whining.
 */

#include "xk_debug.h"
#include "platform.h"
#include "xk_assert.h"
#include "msg.h"
/* #include "msg_internal.h"	/* XXX fixme */

#ifdef __STDC__

/* static */ bool cksum_helper(char *, long, void *);

#endif

/* 
 * This code is not really platform-independent.  It assumes an
 * architecture with 16-bit shorts and 32-bit longs; the generic ocsum
 * below additionally handles 64-bit longs when xk_int64 is defined.
 * A platform for which this is not true can add its own version of the
 * checksum code to the pxk library and override these functions.
 */

/*
 * Trace level for the checksum code, initially 0.
 * NOTE(review): presumably this is what the `cksum' tag passed to the
 * xTrace*() calls in this file resolves to -- confirm in xk_debug.h.
 */
int tracecksum = 0;

/*
 * ocsum -- return the 1's complement sum of the 'count' 16-bit words
 * pointed to by 'hdr'.  Move and add by 32 bits at a time.
 *
 * assumes first address is on an even address, aligned for a short
 * assumes next even address is aligned for u_long
 * assumes len is > 0
 *
 *  To test this, please be sure to use 1, 2, 3, 4, and higher buffer
 *  sizes, with alignment at all possible byte positions!
 *  See the cksum_helper routine, especially.
 */

#ifndef intelx86
/*
 * ocsum (generic version) -- add up the 'count' 16-bit words starting
 * at 'hdr' and return the total folded to 16 bits with end-around
 * carry.  The result is NOT complemented.
 *
 * hdr   - first 16-bit word to add; it is read through u_short and
 *         u_long pointers, so it must at least be aligned for a short
 * count - number of 16-bit words to add; a count <= 0 returns 0
 *         without reading through hdr
 *
 * The buffer is first advanced one short at a time to a u_long
 * boundary, the bulk is then added a whole u_long at a time, and the
 * leftover shorts are added individually.
 */
u_short
ocsum(hdr, count)
u_short *hdr;
int     count;
{
/* 
 * ccsum.c - Highly optimized MIPS checksum function.
 * by Jonathan Kay, Computer Systems Lab, UCSD         4/2/93
 *
 * Version 2.0
 * Techniques and credits:
 *   Basic algorithm is 3-instruction inner loop sum by Peter Desnoyers.
 *   Full word-size reading as described in Usenix W'93 paper.
 *   Pipelined latency absorption technique as described in paper.
 *   Unrolling chosen through testing and examination of actual workload.
 *   Rewrite in 'C' without loss of performance suggested by Vernon Schryver.
 *   15% faster than version 1 ("Usenix version").
 *   150% faster than Ultrix 4.2A checksum routine.
 */

/*
 * Copyright (c) 1993 Regents of the University of California.
 * All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written agreement is
 * hereby granted, provided that the above copyright notice and the following
 * paragraph appears in all copies of this software.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE REGENTS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

    u_long hilo, high;		/* sum of whole longs / of their upper halves */
    u_long w0, w1;		/* the two longs currently "in flight" */
    union memptr {		/* one cursor, viewed at several widths */
	u_int *i;
	u_long *l;
	u_long u;		/* the address as an integer, for alignment tests */
	u_short *s;
	u_char *c;
    } buf;
    u_long sum = 0;
#define HALF_REG_WIDTH  (8 * sizeof(u_long) / 2)
    /* kept signed so that comparisons against 'count' stay signed */
#define SHORTS_PER_LONG ((int)(sizeof(long)/sizeof(short)))

    /*
     * Nothing to add.  Bail out before the alignment step: on the
     * 32-bit path it consumes a short without looking at count, and a
     * negative count must never reach the loops below.
     */
    if (count <= 0) {
	return 0;
    }

    buf.s = hdr;

#ifdef xk_int64
    xAssert(count < (1L<<32));	/* longer messages can cause overflows */

    /* 64-bit align: add single shorts until the address is a multiple of 8 */
    while (count > 0 && (buf.u & 0x6) != 0) {
	--count;
	sum += *buf.s++;
    } /* while */
#else
    xAssert(count < (1L<<16));	/* longer messages can cause overflows */

    /* 32-bit-align (count >= 1 here, so this short is inside the buffer) */
    if (buf.u & 0x2) {
	sum += *(buf.s++);
	--count;
    }
#endif /* xk_int64 */

    /*
     * long-aligned sum, 16 times unrolled.
     * Peter Desnoyers' unbelievable 3-instruction main loop.
     * The conditional is predicted false because if it is
     * true, it is well worth paying some I-cache misses
     * and if it is false, we care about latency and want
     * to get the minimum number of misses.
     *
     * Each step adds the long loaded two steps earlier and loads the
     * one needed two steps later.  A pass therefore adds 16 longs but
     * reads 18, which is why (16 + 2) longs must remain to enter or
     * repeat the loop.  The two longs preloaded by the last pass are
     * dropped and re-read by the loop that follows.
     */
    if (count >= (16 + 2) * SHORTS_PER_LONG PREDICT_FALSE) {
	w0 = buf.l[0];
	w1 = buf.l[1];
	high = hilo = 0;
	do {
	    count -= 16 * SHORTS_PER_LONG;
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[2];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[3];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[4];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[5];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[6];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[7];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[8];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[9];

	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[10];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[11];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[12];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[13];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[14];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[15];
	    hilo += w0; high += w0 >> HALF_REG_WIDTH; w0 = buf.l[16];
	    hilo += w1; high += w1 >> HALF_REG_WIDTH; w1 = buf.l[17];
	    buf.l += 16;
	} while (count >= (16 + 2) * SHORTS_PER_LONG);
	/*
	 * hilo holds the sum of the whole longs (carries out of the top
	 * are lost) and high the sum of their upper halves; removing the
	 * upper halves from hilo leaves the sum of the lower halves.
	 */
	hilo -= (high << HALF_REG_WIDTH);
	sum += hilo;
	sum += high;
    } /* if */

    /* remaining whole longs, one at a time, same hilo/high bookkeeping */
    high = hilo = 0;
    while (count >= SHORTS_PER_LONG) {
	count -= SHORTS_PER_LONG;
	w0 = *(buf.l++); hilo += w0; high += w0 >> HALF_REG_WIDTH;
    } /* while */
    hilo -= (high << HALF_REG_WIDTH);
    sum += hilo;
    sum += high;

    /* remaining shorts (fewer than one long's worth) */
    while (count > 0) {
	--count;
	sum += *(buf.s++);
    } /* while */

    /* add all shorts in a long together to get full sum */
#ifdef xk_int64
    /* four 16-bit fields: the result can be as large as 0x3fffc */
    sum = (sum & 0xffff) + ((sum>>16) & 0xffff) +
          ((sum>>32) & 0xffff) + ((sum>>48) & 0xffff);
#endif /* xk_int64 */
    /*
     * Two end-around-carry folds.  The first leaves at most 0x1fffe
     * (0x10001 on the 64-bit path), the second at most 0xffff, so
     * nothing is lost when the value is returned as a u_short.
     */
    sum = (sum & 0xffff) + (sum >> 16);
    sum = (sum & 0xffff) + (sum >> 16);
    return sum;
#undef HALF_REG_WIDTH
#undef SHORTS_PER_LONG
} /* ocsum */

#else /* intelx86 */

/*
 * ocsum (intelx86 version) -- add up the 'count' 16-bit words starting
 * at 'hdr' and return the total folded to 16 bits with end-around
 * carry.  The result is NOT complemented.
 *
 * hdr   - first 16-bit word to add; must be aligned for a short
 * count - number of 16-bit words to add; a count <= 0 returns 0
 *         without reading through hdr
 *
 * Every u_long that is added is accounted for as two 16-bit words, so
 * this version is only correct where u_long is 32 bits wide.
 */
u_short
ocsum(hdr, count)
u_short *hdr;
register int count;
{
    register u_long sum = 0;
    register u_long *lp;
    register u_long overflow = 0;	/* number of carries out of 'sum' */

    /*
     * Nothing to add.  Without this test a zero count at an address
     * that is not a multiple of 4 would consume one short in the
     * alignment step, leave count at -1 and then read a second short in
     * the tail step below.
     */
    if (count <= 0) {
	return 0;
    }

    if ((u_long)hdr & 0x2) {   /* get up to a mult of 4 address */
      sum += *hdr++;
      --count;
    }

    /*
     * Add a long at a time, eight longs (16 shorts) per pass.  After an
     * unsigned add, a result smaller than the operand just added means
     * the add wrapped around; such carries are counted in 'overflow'.
     */
    lp = (u_long*) hdr;
    while (count >= 16) {
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	sum += *lp;
	if (sum < *lp++) overflow++;
	count -= 16;
    } /* while */

    /* remaining whole longs */
    while (count >= 2) {
	sum += *lp;
	if (sum < *lp++) overflow++;
	count -= 2;
    }

    /* at most one short is left over */
    if (count > 0) {
	hdr = (u_short*) lp;
	sum += *hdr;
	if (sum < *hdr++) overflow++;
    }

    /* feed the counted carries back in (end-around carry) */
    sum += overflow;
    if (sum < overflow) sum++;

    /* fold the two halves together; the second fold absorbs the carry */
    sum = (sum & 0xffff) + ((sum>>16) & 0xffff);
    sum = (sum & 0xffff) + ((sum>>16) & 0xffff);
    return sum;
}
#endif /* intelx86 */

#if 0
/*
 * ocsum_simple -- return the 1's complement sum of the 'count' 16-bit words
 * pointed to by 'hdr'.  
 */
u_short
ocsum_simple(hdr, count)
u_short *hdr;
int     count;
{
    register u_long acc = 0;
    
    while (count--) {
	acc += *hdr++;
	if (acc & 0xFFFF0000) {
	    /* Carry occurred -- wrap around */
	    acc &= 0xFFFF;
	    acc++;
	}
    }
    return acc & 0xFFFF;
}
#endif

/* Compute checksum over BUF and then add in checksum over message M. */
u_short
inCkSum(m, buf, buf_len)
Msg     *m;
u_short *buf;
int     buf_len;
{
    int     len, saved_len;
    MsgWalk cxt;
    bool    odd_addr;
    u_long  sum, psum;
    int     odd;
    u_char  *data;

    xTrace1(cksum, TR_DETAILED, "inCkSum: msg-len=%ld", msgLength(m));
    xAssert(! (buf_len % 2));

    /* checksum buffer */

    sum = ocsum(buf, buf_len / 2);

    xTrace1(cksum, TR_EVENTS, "buf checksum: %x", sum);

    /* checksum message */

    odd = 0;
    msgWalkInit(&cxt, m);
    while ((data = msgWalkNext(&cxt, &len)) != 0) {
	xTrace2(cksum, TR_DETAILED, "data at 0x%lx len %ld",
		(u_long)data, len);
#ifndef ENDIAN
	xError("Machine byte order unknown; cannot compute checksum");
#endif
	saved_len = len;

	/* add first byte if necessary to put DATA on an even address */

	odd_addr = (u_long)data & 0x1;
	psum = 0;
	if (odd_addr PREDICT_FALSE) {
#if ENDIAN == LITTLE
	    psum = *data++ << 8;
#else
	    psum = *data++;
#endif
	    --len;
	} /* if */

	if (len >= 2) {
	    psum += ocsum((u_short *)data, len / 2);
	    data += len & ~1;
	} /* if */

	/* add in last byte if there is one */
	if (len & 1) {
#if ENDIAN == LITTLE
	    psum += *data;
#else
	    psum += *data << 8;
#endif
	} /* if */
	psum = (psum & 0xffff) + ((psum >> 16) & 0xffff);
	psum = (psum & 0xffff) + ((psum >> 16) & 0xffff);

	/* swap bytes in the partial sum if necessary */
	if (odd_addr ^ odd) {
	    /* wrap possible overflow */
	    xAssert(!(psum & 0xffff0000));
	    psum = ((psum >> 8) & 0xff) | ((psum & 0xff) << 8);
	} /* if */
	/*
	 * Add partial sum to total sum and indicate whether an odd or
	 * even total number of bytes have been processed
	 */
	sum += psum;
	odd = odd ^ (saved_len & 1);
	xTrace1(cksum, TR_DETAILED, "inCkSum: sum=%x", sum);
    } /* while */
    msgWalkDone(&cxt);

    sum = (sum & 0xffff) + ((sum >> 16) & 0xffff);
    sum = (sum & 0xffff) + ((sum >> 16) & 0xffff);
    xTrace1(cksum, TR_EVENTS, "Total checksum: %x", sum);
    xAssert(!(sum >> 16));
    return ~sum & 0xffff;
} /* inCkSum */

			/*** end of ocsum.c ***/
