/*
 * tcp_output.c
 *
 * Derived from:
 *
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)tcp_output.c	8.3 (Berkeley) 12/30/93
 *
 * Modified for x-kernel v3.3
 * Modifications Copyright (c) 1996,1991  Arizona Board of Regents
 *
 * $Revision: 1.3 $
 * $Date: 1996/06/14 21:28:21 $
 */

#include "xkernel.h"
#include "tcp_internal.h"

/*
 * NOTE(review): TCPOUTFLAGS is defined before tcp_fsm.h is included, and
 * btcp_output() below indexes tcp_outflags[] by connection state.  Presumably
 * the define makes tcp_fsm.h emit that table in this translation unit (and
 * XNETSIM builds get it from elsewhere) -- confirm against tcp_fsm.h.
 */
#ifndef XNETSIM
#define TCPOUTFLAGS 
#endif

#include "tcp_fsm.h"
#include "tcp_seq.h"
#include "tcp_timer.h"
#include "tcp_var.h"
#include "tcpip.h"
#include "tcp_debug.h"
#include "btcp.h"

/* #ifdef notyet */
/* extern struct mbuf *m_copypack(); */
/* #endif */

/*
 * Size, in bytes, of the on-stack buffer (opt[]) in which btcp_output()
 * assembles the TCP options before pushing them onto the outgoing message.
 */
#define MAX_TCPOPTLEN 32	/* max # bytes that go in options */

/*
 * Tcp output routine: figure out what should be sent and send it.
 *
 * Parameters:
 *	tp	control block of the connection to service.
 *
 * Each pass (label "again") works out how much of the session's send buffer
 * fits in MIN(snd_wnd, snd_cwnd), decides whether a segment is warranted
 * (data, window update, pending ACK, SYN/RST/FIN, urgent data), and if so
 * builds the options and TCP header, advances snd_nxt/snd_max, arms the
 * retransmit timer and pushes the message to the lower session.  When more
 * than one segment's worth of data is ready (sendalot), it loops and sends
 * again.
 *
 * Returns:
 *	0	if the session state is gone, if there was nothing worth
 *		sending, or after the last segment was pushed successfully;
 *	nonzero	if xPush() reported a failure (the value is the result of the
 *		"< 0" test on xPush's return, see the note at the push below).
 */
int
btcp_output(tp)
register struct tcpcb *tp;
{
    register Sessn so = tp->t_inpcb->inp_session;
    register struct tcpstate *tcpst = sototcpst(so);
    register long len, win;
    int      off, flags, error;
#ifdef XNETSIM
    int      sndCnt = 0;	/* number of passes through "again" */
#endif
    struct tcphdr tHdr;
    u_char   opt[MAX_TCPOPTLEN];
    Msg      m;
    unsigned optlen, hdrlen, lv;
    int      idle, sendalot;
    PSTATE   *ps = (PSTATE *)so->myprotl->state;

    xTrace0(tcpp, 5, "tcp output");
    if (tcpst == NULL) {
        /*
         * Oops, looks like the socket closed on us.  Well, no need to do
         * any more output so... -mjk 8/16/90
         */
        xTrace0(tcpp, 5, "tcpst NULL -- btcp output exiting");
        return 0;
    }

#ifdef XNETSIM
    /*
     * v_other10 is set to 1 just before the xPush() call below and cleared
     * right after it, so a positive value here means an earlier invocation
     * has not yet come back from xPush().  Do not send in that case.
     */
    if (tp->v_other10 > 0) {
        printf("tcp_output returning because previous thread is blocked!\n");
        return 0;
    }
#endif
    DO_TRACE(ps, TCP_EVENT_OUT, tcpGetTime(), tp->t_state, tcpst->tid);
    /*
     * Determine length of data that should be transmitted, and flags that will
     * be used.  If there is some data or critical controls (SYN, RST) to send,
     * then transmit; otherwise, investigate further.
     */
    idle = (tp->snd_max == tp->snd_una);
    if (idle && tp->t_idle >= tp->t_rxtcur)
    	/*
    	 * We have been idle for "a while" and no acks are expected to clock
    	 * out any data we send -- slow start to get ack "clock" running again.
    	 */
    	tp->snd_cwnd = tp->t_maxseg;
again:
#ifdef XNETSIM
    sndCnt++;
#endif
    sendalot = 0;
    /* Offset into the send buffer of the first byte not yet sent. */
    off = tp->snd_nxt - tp->snd_una;

    xTrace2(tcpp, TR_MAJOR_EVENTS,
	    "btcp_output: tp->snd_wnd=%x, tp->snd_cwnd=%x",
	    tp->snd_wnd, tp->snd_cwnd);

    /* Usable send window: the smaller of the peer's and the congestion window. */
    win = MIN(tp->snd_wnd, tp->snd_cwnd);

    DO_TRACE(ps, TCP_EVENT_OUT0, sblength(tcpst->snd), 
	     tp->snd_wnd>>4,tcpst->tid);
    DO_TRACE(ps, TCP_EVENT_DATA, tp->snd_nxt, tp->snd_cwnd>>4, tcpst->tid);
    DO_TRACE(ps, TCP_EVENT_DATA, tp->snd_una, tp->snd_ssthresh>>4, 
	     tcpst->tid);

    /* Default header flags for the current connection state. */
    flags = tcp_outflags[tp->t_state];
    /*
     * If in persist timeout with window of 0, send 1 byte.  Otherwise, if
     * window is small but nonzero and timer expired, we will send what we can
     * and go to transmit state.
     */
    if (tp->t_force) {
    	if (win == 0) {
    	    /*
	     * If we still have some data to send, then clear the FIN bit.
	     * Usually this would happen below when it realizes that we aren't
	     * sending all the data.  However, if we have exactly 1 byte of
	     * unsent data, then it won't clear the FIN bit below, and if we are
	     * in persist state, we wind up sending the packet without
	     * recording that we sent the FIN bit.
	     *
	     * We can't just blindly clear the FIN bit, because if we don't
	     * have any more data to send then the probe will be the FIN itself.
	     */
	    if (off < sblength(tcpst->snd))
	    	flags &= ~TH_FIN;
	    win = 1;
	}
	else {
	    /* Window is open: leave persist state and reset the backoff. */
	    tp->t_timer[TCPT_PERSIST] = 0;
	    tp->t_rxtshift = 0;
	}
    }

    /* Bytes that may be sent now: buffered data within the window, past off. */
    len = MIN(sblength(tcpst->snd), win) - off;
    xTrace4(tcpp, 5, "btcp_output: sbLen: %d  win: %d  off: %d  len == %d",
	    sblength(tcpst->snd), win, off, len);

    if (len < 0) {
    	/*
    	 * If FIN has been sent but not acked, but we haven't been called to
    	 * retransmit, len will be -1.  Otherwise, window shrank after we sent
    	 * into it.  If window shrank to 0, cancel pending retransmit and pull
    	 * snd_nxt back to (closed) window.  We will enter persist state below.
    	 * If the window didn't close completely, just wait for an ACK.
    	 */
    	len = 0;
    	if (win == 0) {
	    tp->t_timer[TCPT_REXMT] = 0;
	    tp->snd_nxt = tp->snd_una;
    	}
    }
    /* Never send more than one maximum segment per pass; loop for the rest. */
    if (len > tp->t_maxseg) {
    	len = tp->t_maxseg;
    	sendalot = 1;
    }
    /* FIN may only ride on the segment that carries the last buffered byte. */
    if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + sblength(tcpst->snd)))
    	flags &= ~TH_FIN;

    /* From here on "win" is the receive window we will advertise. */
    win = tcpst->rcv_space;

    /*
     * Sender silly window avoidance.  With data to send, transmit when any of
     * the following holds; otherwise don't bother:
     *	- the segment is a full maximum-size segment;
     *	- the connection is idle (or TF_NODELAY is set) and this segment
     *	  empties the send buffer;
     *	- we are being forced (persist timeout);
     *	- the segment is at least half of the largest window the peer has
     *	  ever offered (the peer's buffer is tiny);
     *	- we are retransmitting (possibly after the persist timer forced us
     *	  to send into a small window).
     */
    if (len) {
    	if (len == tp->t_maxseg)
	    goto send;
    	if ((idle || tp->t_flags & TF_NODELAY) &&
	    len + off >= sblength(tcpst->snd))
	    goto send;
    	if (tp->t_force)
	    goto send;
    	if (len >= tp->max_sndwnd / 2)
	    goto send;
    	if (SEQ_LT(tp->snd_nxt, tp->snd_max))
	    goto send;
    }

    /*
     * Compare available window to amount of window known to peer (as
     * advertised window less next expected input).  If the difference is at
     * least two max size segments, or at least 50% of the maximum possible
     * window, then want to send a window update to peer.
     */
    if (win > 0) {
    	/* 
    	 * "adv" is the amount we can increase the window, taking into account
    	 * that we are limited by TCP_MAXWIN << tp->rcv_scale.
    	 */
    	long adv = MIN(win, (long)TCP_MAXWIN << tp->rcv_scale) -
	               (tp->rcv_adv - tp->rcv_nxt);

    	if (adv >= (long) (2 * tp->t_maxseg))
	    goto send;
    	if (2 * adv >= (long) tcpst->rcv_hiwat)
	    goto send;
    }

    /* Send if we owe peer an ACK. */
    if (tp->t_flags & TF_ACKNOW)
    	goto send;
    /* SYN and RST are always sent immediately. */
    if (flags & (TH_SYN|TH_RST))
    	goto send;
    /* Urgent data is outstanding. */
    if (SEQ_GT(tp->snd_up, tp->snd_una))
    	goto send;
    /*
     * If our state indicates that FIN should be sent and we have not yet done
     * so, or we're retransmitting the FIN, then we need to send.
     */
    if (flags & TH_FIN &&
        ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
    	goto send;

    /*
     * TCP window updates are not reliable, rather a polling protocol using
     * ``persist'' packets is used to insure receipt of window updates.  The
     * three ``states'' for the output side are:
     *	idle			not doing retransmits or persists
     *	persisting		to move a small or zero window
     *	(re)transmitting	and thereby not persisting
     *
     * tp->t_timer[TCPT_PERSIST]
     *	is set when we are in persist state.
     * tp->t_force
     *	is set when we are called to send a persist packet.
     * tp->t_timer[TCPT_REXMT]
     *	is set when we are retransmitting
     * The output side is idle when both timers are zero.
     *
     * If send window is too small, there is data to transmit, and no
     * retransmit or persist is pending, then go to persist state.  If nothing
     * happens soon, send when timer expires: if window is nonzero, transmit
     * what we can, otherwise force out a byte.
     */
    if (sblength(tcpst->snd) && tp->t_timer[TCPT_REXMT] == 0 &&
        tp->t_timer[TCPT_PERSIST] == 0) {
    	tp->t_rxtshift = 0;
    	tcp_setpersist(tp);
    }

    /* No reason to send a segment, just return. */
    xTrace0(tcpp, 5, "tcp_output -- no reason to send");
    return 0;

send:
    /*
     * Before ESTABLISHED, force sending of initial options unless TCP set not
     * to do any options.  NOTE: we assume that the IP/TCP header plus TCP
     * options always fit in a single mbuf, leaving room for a maximum link
     * header, i.e.
     *	max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MHLEN
     *
     * hdrlen is maintained here but is only consumed by the disabled
     * DIAGNOSTIC check further down.
     */
    optlen = 0;
    hdrlen = sizeof (struct tcpiphdr);
    if (flags & TH_SYN) {
    	tp->snd_nxt = tp->iss;	/* LSB: not present in prev vers */
    	if ((tp->t_flags & TF_NOOPT) == 0) {
	    u_short mss;

	    /* Maximum-segment-size option: kind, length 4, 16-bit MSS. */
	    opt[0] = TCPOPT_MAXSEG;
	    opt[1] = 4;
/* 	    mss = htons((u_short) tcp_mss(tp)); <LSB> */
	    /*
	     * NOTE(review): if MIN is a macro that evaluates its arguments
	     * more than once, tcp_mss(tp) may be called twice here -- confirm
	     * that tcp_mss() is safe to call repeatedly.
	     */
	    mss = MIN(tcpst->rcv_hiwat/2, tcp_mss(tp));
/*	    if (mss > IP_MSS - sizeof(struct tcpiphdr)) { */
		mss = htons(mss);			  
		bcopy((char *)&mss, (char *)(opt+2), sizeof(mss));
		optlen = 4;
/*	    } */
	 
	    /*
	     * Window-scale option (NOP-padded to 4 bytes): sent on our initial
	     * SYN when we requested scaling, and on a SYN|ACK only when the
	     * peer's SYN carried the option too.
	     */
	    if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 ||
		(tp->t_flags & TF_RCVD_SCALE))) {
/*		*((u_int *) (opt + optlen)) =  */
		lv = htonl(TCPOPT_NOP << 24 | TCPOPT_WINDOW << 16 |
			   TCPOLEN_WINDOW << 8 | tp->request_r_scale);
		bcopy((char *)&lv, (opt+optlen), sizeof(int));
		optlen += 4;
	    }
	}
     }
 
    /*
     * Send a timestamp and echo-reply if this is a SYN and our side wants to
     * use timestamps (TF_REQ_TSTMP is set) or both our side and our peer have
     * sent timestamps in our SYN's.  Never sent on a RST or when options are
     * disabled (TF_NOOPT).
     */
    if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
        (flags & TH_RST) == 0 && ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
        (tp->t_flags & TF_RCVD_TSTMP))) {
	u_int lp[3];
 
 	/* Form timestamp option as shown in appendix A of RFC 1323. */
 	lp[0] = htonl(TCPOPT_TSTAMP_HDR);
 	lp[1] = htonl(ps->tcp_now);
 	lp[2] = htonl(tp->ts_recent);
       	bcopy((char *)lp, (opt+optlen), 3*sizeof(int));
	optlen += TCPOLEN_TSTAMP_APPA;
	xTrace2(tcpp,TR_MAJOR_EVENTS,"btcp_output: sending TS: (%d,%d)",
	        ps->tcp_now, tp->ts_recent);
    }

    hdrlen += optlen;
 
    /*
     * Adjust data length if insertion of options will bump the packet length
     * beyond the t_maxseg length.
     */
    if (len > tp->t_maxseg - optlen) {
    	len = tp->t_maxseg - optlen;
    	sendalot = 1;
    }


/* #ifdef DIAGNOSTIC */
/*    if (max_linkhdr + hdrlen > MHLEN) */
/* 	  panic("tcphdr too big"); */
/* #endif */

    /*
     * Collect len bytes of the send buffer, starting at offset off, into the
     * message m, and initialize the header from the template for sends on
     * this connection.  The statistics counters below classify the segment.
     */
    sbcollect(tcpst->snd, &m, off, len, 0);
    if (len) {
    	if (tp->t_force && len == 1)
	    ps->tcpstat.tcps_sndprobe++;
    	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
	    DO_TRACE(ps, TCP_EVENT_DUP, 0, len, tcpst->tid);
	    ps->tcpstat.tcps_sndrexmitpack++;
	    ps->tcpstat.tcps_sndrexmitbyte += len;
    	}
	else {
	    ps->tcpstat.tcps_sndpack++;
	    ps->tcpstat.tcps_sndbyte += len;
    	}

    	/* LSB: stuff out */

    	/*
    	 * If we're sending everything we've got, set PUSH.  (This will keep
    	 * happy those implementations which only give data to the user when a
    	 * buffer fills or a PUSH comes in.)
    	 */
    	if (off + len == sblength(tcpst->snd))
	    flags |= TH_PUSH;
    }
    else {
    	if (tp->t_flags & TF_ACKNOW)
	    ps->tcpstat.tcps_sndacks++;
    	else if (flags & (TH_SYN|TH_FIN|TH_RST))
	    ps->tcpstat.tcps_sndctrl++;
    	else if (SEQ_GT(tp->snd_up, tp->snd_una))
    	    ps->tcpstat.tcps_sndurg++;
    	else
	    ps->tcpstat.tcps_sndwinup++;
    }
    /* A connection without a header template cannot send anything. */
    if (tp->t_template == 0)
	Kabort("btcp_output");
    tHdr = tp->t_template->ti_t;

    /*
     * Fill in fields, remembering maximum advertised window for use in
     * delaying messages about window sizes.  If resending a FIN, be sure not
     * to use a new sequence number.
     */
    if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && 
	tp->snd_nxt == tp->snd_max)
    	tp->snd_nxt--;
    /*
     * If we are doing retransmissions, then snd_nxt will not reflect the first
     * unsent octet.  For ACK only packets, we do not want the sequence number
     * of the retransmitted packet, we want the sequence number of the next
     * unsent octet.  So, if there is no data (and no SYN or FIN), use snd_max
     * instead of snd_nxt when filling in ti_seq.  But if we are in persist
     * state, snd_max might reflect one byte beyond the right edge of the
     * window, so use snd_nxt in that case, since we know we aren't doing a
     * retransmission.  (retransmit and persist are mutually exclusive...)
     */
    if (len || (flags & (TH_SYN|TH_FIN)) || tp->t_timer[TCPT_PERSIST])
    	tHdr.th_seq = tp->snd_nxt;
    else
    	tHdr.th_seq = tp->snd_max;
    xTrace1(tcpp, 4, "Sending seq %d", tHdr.th_seq);
    tHdr.th_ack = tp->rcv_nxt;
    xTrace1(tcpp, 4, "Sending ack %d", tHdr.th_ack);
    xTrace3(tcpp, TR_MAJOR_EVENTS, "Sending seq %d, ack %d (len %d)",
            tHdr.th_seq, tHdr.th_ack, len );
    if (optlen) {
        /* Options occupy a whole number of 32-bit words in the header. */
        int padOptLen = (optlen + 3) & ~0x3;
        void *buf;
 
        buf = msgPush(&m, padOptLen);
        xAssert(buf);
        tcpOptionsStore(opt, buf, padOptLen, &optlen);

	/* Data offset, in 32-bit words, now includes the padded options. */
	tHdr.th_off = (sizeof (struct tcphdr) + padOptLen) >> 2;
    }
#ifdef XNETSIM
    if (sndCnt >= 2)
        flags |= 0x80;        /* Used in simulator to study traffic and
                                 network behavior.  This flag set means that a
				 packet was sent immediately before this one */
    if (tp->snd_cwnd < tp->snd_ssthresh)
        flags |= 0x40;        /* Used in simulator to study traffic and
                                 network behavior.  This flag set means that we
				 are probably in slow-start mode */
#endif
    tHdr.th_flags = flags;
    /*
     * Calculate receive window.  Don't shrink window, but avoid silly window
     * syndrome.
     */
    if (win < (long)(tcpst->rcv_hiwat / 4) && win < (long)tp->t_maxseg)
    	win = 0;
    if (win > (long)TCP_MAXWIN << tp->rcv_scale)
    	win = (long)TCP_MAXWIN << tp->rcv_scale;
    if (win < (long)(tp->rcv_adv - tp->rcv_nxt))
    	win = (long)(tp->rcv_adv - tp->rcv_nxt);
    tHdr.th_win = (u_short) (win>>tp->rcv_scale);
    if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
    	tHdr.th_urp = (u_short)(tp->snd_up - tp->snd_nxt);
    	tHdr.th_flags |= TH_URG;
    }
    else
    	/*
    	 * If no urgent pointer to send, then we pull the urgent pointer to the
    	 * left edge of the send window so that it doesn't drift into the send
    	 * window on sequence number wraparound.
    	 */
    	tp->snd_up = tp->snd_una;		/* drag it along */

    DO_TRACE(ps, TCP_EVENT_OUT1, tHdr.th_seq, msgLength(&m), tcpst->tid);
    xTrace2(tcpp, 4, "btcp_outut: Sending %d bytes with flags ( %s )",
    	msgLength(&m), tcpFlagStr(tHdr.th_flags));
    {
    	/*
    	 * Push the fixed TCP header in front of the options and data.  The
    	 * store routine is handed the template's ti_p part and the message
    	 * itself through "store".
    	 */
    	hdrStore_t store;
    	void       *buf;
		
    	store.h = &tp->t_template->ti_p;
    	store.m = &m;
        buf = msgPush(&m, sizeof(struct tcphdr));
        xAssert(buf);
        tcpHdrStore(&tHdr, buf, sizeof(struct tcphdr), &store);
    }

    /*
     * In transmit state, time the transmission and arrange for the retransmit.
     * In persist state, just set snd_max.
     */
    if (tp->t_force == 0 || tp->t_timer[TCPT_PERSIST] == 0) {
    	tcp_seq startseq = tp->snd_nxt;

    	/* Advance snd_nxt over sequence space of this segment. */
    	if (flags & (TH_SYN|TH_FIN)) {
	    if (flags & TH_SYN)
	    	tp->snd_nxt++;
	    if (flags & TH_FIN) {
	    	tp->snd_nxt++;
	    	tp->t_flags |= TF_SENTFIN;
	    }
    	}
    	tp->snd_nxt += len;
    	if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
    	    tp->snd_max = tp->snd_nxt;
	    /*
	     * Time this transmission if not a retransmission and not currently
	     * timing anything.
	     */
	    if (tp->t_rtt == 0) {
	    	tp->t_rtt = 1;
	    	tp->t_rtseq = startseq;
	    	ps->tcpstat.tcps_segstimed++;
	    }
	}

    	/*
    	 * Set retransmit timer if not currently set, and not doing an ack or a
    	 * keep-alive probe.  Initial value for retransmit timer is smoothed
    	 * round-trip time + 2 * round-trip time variance.  Initialize shift
    	 * counter which is used for backoff of retransmit time.
    	 */
    	if (tp->t_timer[TCPT_REXMT] == 0 && tp->snd_nxt != tp->snd_una) {
	    tp->t_timer[TCPT_REXMT] = tp->t_rxtcur;
	    if (tp->t_timer[TCPT_PERSIST]) {
	    	tp->t_timer[TCPT_PERSIST] = 0;
	    	tp->t_rxtshift = 0;
	    }
	}
    }
    else
    	if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
	    tp->snd_max = tp->snd_nxt + len;

    /*
     * Send it out.
     *
     * NOTE(review): error receives the result of the "< 0" comparison (0 or
     * 1), not an errno value, so the ENOBUFS test below can only match if
     * ENOBUFS happens to equal 1 -- btcp_quench() looks unreachable here, and
     * callers see 1 rather than an error code.  Confirm the intended
     * behaviour against xPush()'s return convention.
     */
#ifdef XNETSIM
    tp->v_other10 = 1;
#endif
    error = xPush(xGetSessnDown(tcpcbtoso(tp), 0), &m) < 0;
#ifdef XNETSIM
    tp->v_other10 = 0;
#endif
    msgDestroy(&m);
    if (error) {
    	if (error == ENOBUFS)
    	    btcp_quench(tp->t_inpcb);
        return error;
    }
    ps->tcpstat.tcps_sndtotal++;

    /*
     * Data sent (as far as we can tell).  If this advertises a larger window
     * than any other segment, then remember the size of the advertised window.
     * Any pending ACK has now been sent.
     */
    if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv)) {
    	tp->rcv_adv = tp->rcv_nxt + win;
    	xTrace3(tcpp, 5, "rcv_adv = rcv_nxt (%x) + win (%x) = %x",
    		tp->rcv_nxt, win, tp->rcv_adv);
    }
    tp->last_ack_sent = tp->rcv_nxt;
    tp->t_flags &= ~(TF_ACKNOW|TF_DELACK);
    /* More data was ready than fit in this segment: go build the next one. */
    if (sendalot)
    	goto again;
    return 0;
}

#ifndef XNETSIM

/*
 * tcp_setpersist -- start or restart the persist timer of a connection.
 *
 * Parameters:
 *	tp	control block of the connection.
 *
 * The base interval is ((t_srtt >> 2) + t_rttvar) >> 1, multiplied by the
 * backoff factor tcp_backoff[t_rxtshift]; TCPT_RANGESET stores it in
 * t_timer[TCPT_PERSIST] (presumably clamped to the range
 * TCPTV_PERSMIN..TCPTV_PERSMAX -- the macro is defined elsewhere).  The
 * backoff shift is then advanced by one, up to TCP_MAXRXTSHIFT.
 *
 * The retransmit timer must not be running when this is called; if it is,
 * the routine aborts via Kabort().
 */
void
tcp_setpersist(tp)
register struct tcpcb *tp;
{
    /* Explicit type: implicit "int" declarations are not valid since C99. */
    register int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;

    /* Retransmit and persist are mutually exclusive. */
    if (tp->t_timer[TCPT_REXMT])
    	Kabort("btcp_output REXMT");

    /* Start/restart persistence timer. */
    TCPT_RANGESET(tp->t_timer[TCPT_PERSIST], t * tcp_backoff[tp->t_rxtshift],
	          TCPTV_PERSMIN, TCPTV_PERSMAX);
    if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
    	tp->t_rxtshift++;
}

#endif
