/*
 * Copyright (c) 2001-2002 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 *	Ohio Trollius
 *	Copyright 1997 The Ohio State University
 *	NJN
 *
 *	$Id: rpi_ushm.c,v 1.4.2.2 2002/10/09 19:49:52 brbarret Exp $
 *
 *	Function:	- shared memory and TCP client-to-client interface
 */

#include <lam_config.h>
#include <sfh.h>

#include <errno.h>
#include <stdlib.h>
#include <fcntl.h>
#include <string.h>
#include <sys/time.h>				/* LINUX FD_SET etc. */
#include <sys/types.h>
#include <sys/socket.h>
#if HAVE_NETINET_TCP_H
#include <netinet/tcp.h>
#endif
#include <unistd.h>
#if LAM_HAVE_FIONBIO
#include <sys/ioctl.h>
#endif

#if LAM_NEED_SYS_SELECT_H
#include <sys/select.h>
#endif

#include <app_mgmt.h>
#include <blktype.h>
#include <dl_inet.h>
#include <mpi.h>
#include <mpisys.h>
#include <net.h>
#include <rpisys.h>
#include <terror.h>
#include <typical.h>
#include <t_types.h>
#if LAM_WANT_IMPI
#include <impi.h>
#endif

/*
 * public functions
 *
 * All of these are defined later in this file.  They are declared in
 * the old (unprototyped) style, so the compiler does not check the
 * arguments passed at call sites against the definitions.
 */ 
int			_rpi_c2c_init();	/* one-time RPI set-up */
int			_rpi_c2c_addprocs();	/* connect to new processes */
int			_rpi_c2c_build();	/* build RPI part of request */
int			_rpi_c2c_start();	/* start a request */
int			_rpi_c2c_destroy();	/* destroy RPI part of request */
int			_rpi_c2c_advance();	/* advance a request list */
int			_rpi_c2c_iprobe();	/* non-blocking probe */
int			_rpi_c2c_finalize();	/* close connections, clean up */
int			_c2c_envl_cmp();	/* 0 if envelopes match, else 1 */
int			_c2c_comm_dead();	/* flag dead communicator error */
void			_c2c_fill_mpi_status();	/* set source, tag and length */
void			_c2c_fill_wildcards();	/* resolve wildcard rank/tag */

/*
 * external functions
 *
 * None of these are defined in this file.  The declarations carry no
 * parameter lists, so argument checking is left to the programmer.
 * The _shm_req_send_*(), _tcp_req_send_*() and _shmtcp_req_*()
 * functions are not called directly here; _rpi_c2c_start() stores
 * them in a request's cq_adv field.
 */
extern int		sfh_sock_open_srv_inet_stm();
extern int		sfh_sock_open_clt_inet_stm();
extern int		sfh_sock_accept_tmout();
extern int		ldogetlinks();
extern int		_shm_advance();
extern int		_shmtcp_req_recv();
extern int		_shmtcp_req_probe();
extern int		_shm_req_send_long();
extern int		_shm_req_send_short();
extern int		_shm_req_send_synch();
extern int		_tcp_buffered_adv();
extern int		_tcp_req_send_long();
extern int		_tcp_req_send_short();
extern int		_tcp_req_send_synch();
extern int		_shm_create_area();
extern int		_shm_attach_area();
extern int		_shm_cleanup();
extern void		lam_commfault();

/*
 * private functions
 *
 * Forward declarations of the file-local helpers defined below.
 */
static void		add_read();		/* queue peer for reading */
static void		add_read_any_src();	/* queue whole peer group */
static void		add_write();		/* queue peer for writing */
static int		send_to_self();		/* handle send to own process */
static int		send_to_self_match();	/* pair self-send with recv */
static int		connect_all();		/* connect to all processes */
static void		fill_sync();		/* fill connection msg header */
static void		proc_init();		/* reset c2c process data */
static int		finalize1();		/* close one process' socket */
/*
 * public variables
 *
 * Both are re-initialized at the top of every _rpi_c2c_advance() call.
 */
int			_c2c_flblock;		/* blocking flag; starts as the
						 * caller's fl_block and is
						 * cleared when a blocking
						 * request is found done */
int			_c2c_haveadv;		/* have advanced?  set to 1 by
						 * _c2c_comm_dead() */

/*
 * external variables
 *
 * Defined outside this file.  _rpi_c2c_advance() clears the
 * _tcp_read, _tcp_write, _tcp_except and _tcp_eoferr sets and resets
 * the _shm_nread, _shm_nwrite, _tcp_nio and _tcp_sockmax counters at
 * the start of each pass.  connect_all() sizes the _shm_read and
 * _shm_write arrays and records that size in _shm_narray.
 */
extern struct c2c_proc	**_shm_read;		/* reading shm processes */
extern struct c2c_proc	**_shm_write;		/* writing shm processes */
extern int		_shm_nprocs;		/* number of shm processes */
extern int		_shm_narray;		/* shm read/write array sizes */
extern int		_shm_nread;		/* # reading shm processes */
extern int		_shm_nwrite;		/* # writing shm processes */
extern int		_tcp_nio;		/* # processes doing tcp io */
extern int		_tcp_sockmax;		/* max. tcp io socket num. */
extern fd_set		_tcp_read;		/* read sockets */
extern fd_set		_tcp_write;		/* write sockets */
extern fd_set		_tcp_except;		/* exception sockets */
extern fd_set		_tcp_block;		/* blocked mode socket? */
extern fd_set		_tcp_eoferr;		/* eof on socket is error? */
extern MPI_Request	_tcp_lastreq;		/* last tcp request */
extern struct c2c_proc	*_tcp_smap[];		/* map socket to process */
extern int		_shm_poolsize;		/* size of long message pool */
extern int		_shm_maxalloc;		/* max allocation of shmem */

/*
 *	_rpi_c2c_init
 *
 *	Function:	- primary initialization of the RPI subsystem
 *			- reads tuning values from the environment, sets
 *			  up unexpected message buffering and connects
 *			  to the other processes
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_init()

{
	const char	*value;			/* environment variable value */
	int		rank;			/* rank prefixed to output */
/*
 * A tuning variable replaces its current setting only when it is
 * present in the environment.
 */
	value = getenv("LAM_MPI_READLOCKPOLL");
	if (value != 0) {
		_lock_poll_read = atoi(value);
	}

	value = getenv("LAM_MPI_WRITELOCKPOLL");
	if (value != 0) {
		_lock_poll_write = atoi(value);
	}

	value = getenv("LAM_MPI_POLLYIELD");
	if (value != 0) {
		_shm_poll_yield = atoi(value);
	}

	value = getenv("LAM_MPI_SHMPOOLSIZE");
	if (value != 0) {
		_shm_poolsize = atoi(value);
	}

	value = getenv("LAM_MPI_SHMMAXALLOC");
	if (value != 0) {
		_shm_maxalloc = atoi(value);
	}
/*
 * The allocation limit is kept shifted right by LAM_LOG_ALIGN bits.  The
 * shift is applied whether or not the environment supplied a value.
 */
	_shm_maxalloc >>= LAM_LOG_ALIGN;
/*
 * Optionally report the values in effect.  The limit is shifted back
 * for display only.
 */
	if (getenv("LAM_MPI_PRINTENV") != 0) {
		rank = lam_myproc->p_gps.gps_grank;
		printf("%d: _lock_poll_read = %d\n", rank, _lock_poll_read);
		printf("%d: _lock_poll_write = %d\n", rank, _lock_poll_write);
		printf("%d: _shm_poll_yield = %d\n", rank, _shm_poll_yield);
		printf("%d: _shm_poolsize = %d\n", rank, _shm_poolsize);
		printf("%d: _shm_maxalloc = %d\n", rank,
			_shm_maxalloc << LAM_LOG_ALIGN);
	}
/*
 * Unexpected message buffering must be ready before any connection is
 * made.
 */
	if (_cbuf_init() != 0) {
		return(LAMERROR);
	}
/*
 * Finally set up every process for client-to-client communication.
 */
	if (_rpi_c2c_addprocs() != 0) {
		return(LAMERROR);
	}

	return(0);
}

/*
 *	_rpi_c2c_addprocs
 *
 *	Function:	- set up communication with newly added processes
 *			- all the work is delegated to connect_all()
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_addprocs()

{
	int		status;			/* result of connecting */

	status = connect_all();
	return(status);
}

/*
 *	_rpi_c2c_build
 *
 *	Function:	- one-time construction of the RPI part of a request
 *			- kept apart from _rpi_c2c_start() so that a
 *			  persistent request pays this cost only once
 *	Accepts:	- request
 *	Returns:	- 0 (this implementation has no failure path)
 */
int
_rpi_c2c_build(req)

MPI_Request		req;

{
	char		*envelope;		/* request's own envelope */
/*
 * The envelope buffer pointer refers to the envelope stored inside the
 * request itself.
 */
	envelope = (char *) &req->rq_rpi.c2c.cq_env;
	req->rq_rpi.c2c.cq_envbuf = envelope;

	return(0);
}

/*
 *	_rpi_c2c_start
 *
 *	Function:	- initializes RPI dependent aspects of a request
 *			- cost per request start - separated from
 *			  _rpi_c2c_build() to optimize persistent requests
 *	Accepts:	- request list
 *			- request
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_start(req_top, req)

MPI_Request		req_top;
MPI_Request		req;

{
    struct cbuf_msg	*bmsg;			/* buffered message */

    if (commdead_m(req)) return(0);
/*
 * Set common synchronization and communication parameters. The peer in
 * a receive request may be a wildcard but will be set to the actual
 * peer upon matching with an incoming mesage.
 */
    req->rq_rpi.c2c.cq_peer = req->rq_rank;
    req->rq_rpi.c2c.cq_env.ce_flags = 0;
    req->rq_rpi.c2c.cq_env.ce_tag = req->rq_tag;
    req->rq_rpi.c2c.cq_env.ce_cid = req->rq_cid;
/*
 * receive specific initialization
 */
    if ((req->rq_type == LAM_RQIRECV) || (req->rq_type == LAM_RQIPROBE)) {
	req->rq_rpi.c2c.cq_state = C2CREAD;
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_rpi.c2c.cq_peer;
/*
 * Check for matching buffered envelope/message and advance according to
 * protocol with sender. The TCP protocol takes care of receiving from self.
 */
	if ((bmsg = _cbuf_find(&req->rq_rpi.c2c.cq_env))) {
	    _c2c_fill_wildcards(req, &bmsg->cm_env);
	    if (req->rq_proc->p_rpi.c2c.cp_inbox != 0) {
		return(_shm_buffered_adv(req, bmsg));
	    } else {
		return(_tcp_buffered_adv(req, bmsg));
	    }
	}
/*
 * Set function to advance request once a matching request is read in.
 * This function will demux the protocols.
 */
	req->rq_rpi.c2c.cq_adv = (req->rq_type == LAM_RQIRECV)
					? _shmtcp_req_recv : _shmtcp_req_probe;
    }
/*
 * send specific initialization
 */
    else {
	req->rq_rpi.c2c.cq_env.ce_rank = req->rq_comm->c_group->g_myrank;
	req->rq_rpi.c2c.cq_env.ce_seq = req->rq_seq;
	req->rq_rpi.c2c.cq_env.ce_len = req->rq_packsize;

	if (req->rq_proc == lam_myproc) {
/*
 * send to self
 */
	    return(send_to_self(req_top, req));
	}

	else {
/*
 * send to another process
 */
	    req->rq_rpi.c2c.cq_state = C2CWRITE;
	    req->rq_rpi.c2c.cq_msgbuf = req->rq_packbuf;

	    if (req->rq_proc->p_rpi.c2c.cp_inbox != 0) {
/*
 * using shared memory
 */
		if (req->rq_packsize > LAM_SHMSHORTMSGLEN) {
/*
 * long message protocol
 */
		    req->rq_rpi.c2c.cq_nmsgout = LAM_SHMSHORTMSGLEN;
		    req->rq_rpi.c2c.cq_nenvout = ENVSIZE;
		    req->rq_rpi.c2c.cq_env.ce_flags |= C2CLONG;
		    req->rq_rpi.c2c.cq_adv = _shm_req_send_long;

		} else {
/*
 * short message protocol
 */
		    req->rq_rpi.c2c.cq_nmsgout = req->rq_packsize;
		    req->rq_rpi.c2c.cq_nenvout = ENVSIZE;

		    if (req->rq_type == LAM_RQISSEND) {
			req->rq_rpi.c2c.cq_env.ce_flags |= C2CSSEND;
			req->rq_rpi.c2c.cq_adv = _shm_req_send_synch;
		    } else {
			req->rq_rpi.c2c.cq_adv = _shm_req_send_short;
		    }
		}
	    } else {
/*
 * using TCP
 */
		if (req->rq_packsize > LAM_TCPSHORTMSGLEN) {
/*
 * long message protocol
 */
		    req->rq_rpi.c2c.cq_env.ce_flags |= C2CLONG;
		    req->rq_rpi.c2c.cq_adv = _tcp_req_send_long;

		} else {
/*
 * short message protocol
 */
		    req->rq_rpi.c2c.cq_nmsgout = req->rq_packsize;

		    if (req->rq_type == LAM_RQISSEND) {
			req->rq_rpi.c2c.cq_env.ce_flags |= C2CSSEND;
			req->rq_rpi.c2c.cq_adv = _tcp_req_send_synch;
		    } else {
			req->rq_rpi.c2c.cq_adv = _tcp_req_send_short;
		    }
		}

		tcp_set_out_envelope_m(req->rq_rpi.c2c);
	    }
	}
    }

    return(0);
}

/*
 *	_rpi_c2c_destroy
 *
 *	Function:	- destroys the RPI portion of a request
 *			- nothing is released here, so this is a no-op
 *	Accepts:	- request (unused)
 *	Returns:	- 0
 */
int
_rpi_c2c_destroy(req)

MPI_Request		req;

{
	(void) req;

	return(0);
}

/*
 *	_rpi_c2c_advance
 *
 *	Function:	- advances requests in c2c mode
 *			- walks the request list, starting requests still
 *			  in the init state and queueing the peer of each
 *			  unfinished request for reading or writing
 *			- then hands over to _shm_advance()
 *	Accepts:	- request list
 *			- block enable flag
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_advance(req_top, fl_block)

MPI_Request		req_top;
int			fl_block;

{
	MPI_Request	cur;			/* request being examined */
	int		pending;		/* # queued IO operations */
/*
 * Begin every pass with empty IO bookkeeping.
 */
	_c2c_flblock = fl_block;
	_c2c_haveadv = 0;

	_shm_nread = 0;
	_shm_nwrite = 0;

	_tcp_nio = 0;
	_tcp_sockmax = -1;
	FD_ZERO(&_tcp_read);
	FD_ZERO(&_tcp_write);
	FD_ZERO(&_tcp_except);
	FD_ZERO(&_tcp_eoferr);
/*
 * Work out which requests need IO.
 */
	for (cur = req_top; cur != 0; cur = cur->rq_next) {
/*
 * A request still in the init state is started first.
 */
		if ((cur->rq_state == LAM_RQSINIT)
				&& (_mpi_req_start(cur) != MPI_SUCCESS)) {
			return(LAMERROR);
		}
/*
 * A finished request needs no IO.  If it was a blocking one, blocking
 * is no longer allowed in this pass.
 */
		if (cur->rq_state == LAM_RQSDONE) {
			if (cur->rq_flags & LAM_RQFBLOCK) {
				_c2c_flblock = 0;
			}
			continue;
		}

		if (commdead_m(cur)) {
			continue;
		}
/*
 * Queue the peer according to the direction of the request.  A receive
 * without a specific process is queued on its whole peer group.
 */
		if (cur->rq_rpi.c2c.cq_state == C2CWRITE) {
			add_write(&cur->rq_proc->p_rpi.c2c, cur);
		}
		else if (cur->rq_rpi.c2c.cq_state == C2CREAD) {
			if (cur->rq_proc != 0) {
				add_read(&cur->rq_proc->p_rpi.c2c, cur);
			} else {
				add_read_any_src(cur);
			}
		}
	}
/*
 * When lam_ger is set, a blocking pass with nothing queued is reported
 * as an EGERFLOW error.
 */
	pending = _shm_nread + _shm_nwrite + _tcp_nio;

	if (lam_ger && _c2c_flblock && (pending == 0)) {
		errno = EGERFLOW;
		return(LAMERROR);
	}

	return(_shm_advance());
}

/*
 *	_rpi_c2c_iprobe
 *
 *	Function:	- non-blocking probe
 *			- public interface for the peculiar MPI_Iprobe()
 *			  which does not return a request to the user
 *			- the probe request is linked into the request
 *			  list only for the duration of this call
 *	Accepts:	- request
 *	Returns:	- 0: no msg, 1: msg, -1: error
 */
int
_rpi_c2c_iprobe(req)

MPI_Request		req;

{
	int		err;			/* error code */
/*
 * Link the probe request and advance as far as possible without
 * blocking.
 */
	_mpi_req_add_m(req);
	_mpi_req_blkclr_m();
	err = _mpi_req_advance();
/*
 * Unlink the request before looking at the result.  This is done on
 * failure too, so that the list never keeps a request the caller
 * believes has been removed.
 */
	_mpi_req_rem_m(req);

	if (err != MPI_SUCCESS) {
		return(-1);
	}
/*
 * A message was found if the request is in the done state.
 */
	return((req->rq_state == LAM_RQSDONE) ? 1 : 0);
}

/*
 *	_rpi_c2c_finalize
 *
 *	Function:	- c2c cleanup
 *			- with a process: clean up that process only
 *			- without one: release the message buffers and
 *			  the shared memory, then clean up every process
 *	Accepts:	- process to cleanup (0 => all processes)
 *	Returns:	- 0 or LAMERROR
 */
int
_rpi_c2c_finalize(p)

struct _proc		*p;

{
	struct _proc	*cur;			/* process being cleaned up */
/*
 * Single process case.
 */
	if (p != 0) {
		return(finalize1(p));
	}
/*
 * Clean up buffers.
 */
	_cbuf_end();
/*
 * Clean up shared memory and locks.  This is skipped when no shared
 * memory base is set.
 */
	if (_shm_membase && _shm_cleanup(&lam_myproc->p_rpi.c2c)) {
		return(LAMERROR);
	}
/*
 * Close the connection to every process, stopping at the first error.
 */
	cur = lam_topproc();
	while (cur != 0) {
		if (finalize1(cur)) {
			return(LAMERROR);
		}
		cur = lam_nextproc();
	}

	return(0);
}

/*
 *	send_to_self
 *
 *	Function:	- advance send to self
 *			- tries to pair the send with an already started
 *			  receive or probe in the request list
 *			- otherwise queues the message as a buffered
 *			  (unexpected) message
 *	Accepts:	- request list
 *			- send request
 *	Returns:	- 0 or LAMERROR
 */
static int
send_to_self(req_top, send)

MPI_Request		req_top;
MPI_Request		send;

{
    MPI_Request		recv;			/* receive request */
    struct cbuf_msg	msg;			/* buffer list entry */
    int			saved_errno;		/* errno of failed append */
/*
 * Look for inactive matching receive/probe and advance if found.
 * Matching a probe does not consume the send, so the search goes on
 * until a real receive is matched.
 */
    for (recv = req_top; recv; recv = recv->rq_next) {

	if ((recv->rq_state == LAM_RQSSTART)
		&& (recv->rq_rpi.c2c.cq_state == C2CREAD)
		&& (!_c2c_envl_cmp(
		    &send->rq_rpi.c2c.cq_env, &recv->rq_rpi.c2c.cq_env))) {

	    if (send_to_self_match(send, recv)) {
		return(0);
	    }
	}
    }
/*
 * No matching receive found, buffer the whole message and the send is
 * done unless it is a synchronous send, in which case we use the user
 * buffer and the send only completes once a matching receive is posted.
 */
    msg.cm_env = send->rq_rpi.c2c.cq_env;
    msg.cm_proc = 0;

    if (send->rq_type == LAM_RQISSEND) {
	send->rq_rpi.c2c.cq_state = C2CSENDSELF;
	msg.cm_buf = send->rq_packbuf;
	msg.cm_req = send;
    }
    else {
	if (send->rq_packsize > 0) {
	    if ((msg.cm_buf = (char *) malloc(send->rq_packsize)) == 0) {
		return(LAMERROR);
	    }
	    memcpy(msg.cm_buf, send->rq_packbuf, send->rq_packsize);
	} else {
	    msg.cm_buf = 0;
	}

	msg.cm_req = 0;
	send->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
    }

    if (_cbuf_append(&msg)) {
	return(0);
    }
/*
 * The message could not be queued.  Release the private copy made
 * above, which nothing else refers to.  This assumes _cbuf_append()
 * does not take ownership of the buffer when it fails.  The buffer of
 * a synchronous send belongs to the request and is left alone.  errno
 * is preserved for the caller.
 */
    if (msg.cm_req == 0) {
	saved_errno = errno;
	free(msg.cm_buf);
	errno = saved_errno;
    }

    return(LAMERROR);
}

/*
 *	send_to_self_match
 *
 *	Function:	- advance send and matching receive/probe
 *			- a probe only learns about the message, the
 *			  send remains outstanding
 *			- a receive gets the data copied and completes
 *			  both requests
 *	Accepts:	- send request
 *			- receive/probe request
 *	Returns:	- 1: matched a receive, 0: matched a probe
 */
static int
send_to_self_match(send, recv)

MPI_Request		send;
MPI_Request		recv;

{
    int			len;			/* # bytes to transfer */

    recv->rq_seq = send->rq_seq;
    if (recv->rq_type == LAM_RQIPROBE) {
/*
 * The receive is actually a probe so the send is not complete.  Only
 * the probe leaves the set of active requests.
 */
	_c2c_fill_mpi_status(recv, send->rq_rpi.c2c.cq_env.ce_rank,
		send->rq_rpi.c2c.cq_env.ce_tag, send->rq_rpi.c2c.cq_env.ce_len);

	recv->rq_state = LAM_RQSDONE;
	lam_rq_nactv--;
	return(0);
    }
    else {
/*
 * It's really a receive. Do the data transfer.
 *
 * Check for mismatched message lengths.  A message longer than the
 * receive buffer is truncated and flagged.
 */
	if (send->rq_packsize > recv->rq_packsize) {
	    recv->rq_flags |= LAM_RQFTRUNC;
	    len = recv->rq_packsize;
	} else {
	    len = send->rq_packsize;
	}
/*
 * Nothing is copied for an empty message, whose buffers need not be
 * valid pointers.
 */
	if (len > 0) {
	    memcpy(recv->rq_packbuf, send->rq_packbuf, len);
	}

	_c2c_fill_mpi_status(recv, send->rq_rpi.c2c.cq_env.ce_rank,
				send->rq_rpi.c2c.cq_env.ce_tag, len);
/*
 * Two requests complete here, so the active request count drops by
 * two.  Everywhere else in this file a request that becomes done
 * accounts for exactly one decrement.
 */
	send->rq_state = recv->rq_state = LAM_RQSDONE;
	lam_rq_nactv -= 2;
	return(1);
    }
}

/*
 *	add_write
 *
 *	Function:	- add process to write advance list
 *			- a process is queued at most once per pass
 *			- the request becomes the process' write request
 *			  unless it already has one
 *	Accepts:	- process
 *			- writing request
 */
static void
add_write(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
	int		via_shm;		/* peer reached through shm? */

	via_shm = (ps->cp_inbox != 0);
/*
 * Nothing to do when the process is already queued for writing.
 */
	if (via_shm) {
		if (ps->cp_write) {
			return;
		}
	} else if (FD_ISSET(ps->cp_sock, &_tcp_write)) {
		return;
	}
/*
 * Associate request with process.
 */
	if (ps->cp_wreq == 0) {
		ps->cp_wreq = req;
	}

	if (via_shm) {
/*
 * Shared memory: append to the array of writing processes.
 */
		_shm_write[_shm_nwrite] = ps;
		_shm_nwrite++;
		ps->cp_write = 1;
		return;
	}
/*
 * TCP: watch the socket for writability and exceptions, and keep track
 * of the highest socket number in use.
 */
	_tcp_lastreq = req;
	_tcp_nio++;
	FD_SET(ps->cp_sock, &_tcp_except);
	FD_SET(ps->cp_sock, &_tcp_write);

	if (_tcp_sockmax < ps->cp_sock) {
		_tcp_sockmax = ps->cp_sock;
	}
}

/*
 *	add_read
 *
 *	Function:	- add process to read advance list
 *			- a process with neither a shared memory inbox
 *			  nor a socket (such as self) is not added
 *			- a process is queued at most once per pass; the
 *			  request that queues it becomes the one matching
 *			  starts from
 *	Accepts:	- process
 *			- request to start matching from
 */
static void
add_read(ps, req)

struct c2c_proc		*ps;
MPI_Request		req;

{
/*
 * When lam_ger is set, stop reading from a process whose cp_nbfde
 * count has reached MPI_GER.
 */
	if (lam_ger && (ps->cp_nbfde >= MPI_GER)) {
		return;
	}

	if (ps->cp_inbox != 0) {
/*
 * Shared memory: append to the array of reading processes.
 */
		if (!ps->cp_read) {
			ps->cp_mreq = req;
			ps->cp_read = 1;
			_shm_read[_shm_nread] = ps;
			_shm_nread++;
		}
		return;
	}
/*
 * No socket means no TCP connection to read from.
 */
	if (ps->cp_sock < 0) {
		return;
	}

	if (FD_ISSET(ps->cp_sock, &_tcp_read)) {
		return;
	}
/*
 * TCP: watch the socket for readability and exceptions, and keep track
 * of the highest socket number in use.
 */
	ps->cp_mreq = req;
	_tcp_lastreq = req;
	_tcp_nio++;
	FD_SET(ps->cp_sock, &_tcp_except);
	FD_SET(ps->cp_sock, &_tcp_read);

	if (_tcp_sockmax < ps->cp_sock) {
		_tcp_sockmax = ps->cp_sock;
	}
}

/*
 *	add_read_any_src
 *
 *	Function:	- add to the read advance list all processes in
 *			  the peer group of a receive request on
 *			  MPI_ANY_SOURCE
 *			- the peer group is the remote group of an
 *			  inter-communicator, the local group otherwise
 *	Accepts:	- request
 */
static void
add_read_any_src(req)

MPI_Request		req;

{
	struct _group	*peers;			/* peer group */
	struct _proc	*member;		/* process to read from */
	int		nmembers;		/* size of peer group */
	int		idx;

	if (LAM_IS_INTER(req->rq_comm)) {
		peers = req->rq_comm->c_rgroup;
	} else {
		peers = req->rq_comm->c_group;
	}

	nmembers = peers->g_nprocs;

	for (idx = 0; idx < nmembers; idx++) {
		member = peers->g_procs[idx];
#if LAM_WANT_IMPI
/*
 * Special case for IMPI -- if we're receiving from the impid proxy
 * entry in the group, replace it with the One True IMPID Proc.
 */
		if (LAM_GPSCMP(&member->p_gps, &gimpid) == 0
				&& lam_impid_proc != 0) {
			member = lam_impid_proc;
		}
#endif
		add_read(&member->p_rpi.c2c, req);
	}
}

/*
 *	_c2c_fill_wildcards
 *
 *	Function:	- copy the sequence number of the matched envelope
 *			  into the request
 *			- replace a wildcard tag and/or source in the
 *			  request with the matched values
 *			- a resolved source also selects the request's
 *			  peer process
 *	Accepts:	- request
 *			- matched envelope
 */
void
_c2c_fill_wildcards(req, env)

MPI_Request		req;
struct c2c_envl		*env;

{
	struct _group	*peers;			/* peer group */

	req->rq_seq = env->ce_seq;

	if (req->rq_rpi.c2c.cq_env.ce_tag == MPI_ANY_TAG) {
		req->rq_rpi.c2c.cq_env.ce_tag = env->ce_tag;
	}
/*
 * A specific source needs no further work.
 */
	if (req->rq_rpi.c2c.cq_peer != MPI_ANY_SOURCE) {
		return;
	}

	req->rq_rpi.c2c.cq_env.ce_rank = env->ce_rank;
	req->rq_rpi.c2c.cq_peer = env->ce_rank;
/*
 * The sender's rank indexes the remote group of an inter-communicator
 * and the local group otherwise.
 */
	if (LAM_IS_INTER(req->rq_comm)) {
		peers = req->rq_comm->c_rgroup;
	} else {
		peers = req->rq_comm->c_group;
	}

	req->rq_proc = peers->g_procs[req->rq_rpi.c2c.cq_peer];
}

/*
 *	_c2c_fill_mpi_status
 *
 *	Function:	- record source, tag and length of a message in
 *			  the status object held by the request
 *			- the error field of the status is not touched
 *	Accepts:	- request
 *			- rank
 *			- tag
 *			- message length
 */
void
_c2c_fill_mpi_status(req, rank, tag, length)

MPI_Request		req;
int			rank;
int			tag;
int			length;

{
	req->rq_status.st_length = length;
	req->rq_status.MPI_TAG = tag;
	req->rq_status.MPI_SOURCE = rank;
}

/*
 *	_c2c_envl_cmp
 *
 *	Function:	- check if envelopes match
 *			- only the second envelope is examined for
 *			  wildcards
 *			- context ID, rank, tag and the C2CACK and C2C2ND
 *			  flag bits must all agree
 *	Accepts:	- ptr to envelope
 *			- ptr to request envelope
 *	Returns:	- 0 if match, 1 if not
 */
int
_c2c_envl_cmp(pe, pq)

struct c2c_envl		*pe, *pq;

{
/*
 * Each test rejects the pair as soon as one component disagrees.
 */
    if (pe->ce_cid != pq->ce_cid) {
	return(1);
    }

    if ((pq->ce_rank != MPI_ANY_SOURCE) && (pe->ce_rank != pq->ce_rank)) {
	return(1);
    }

    if ((pq->ce_tag != MPI_ANY_TAG) && (pe->ce_tag != pq->ce_tag)) {
	return(1);
    }

    if ((pe->ce_flags & C2CACK) != (pq->ce_flags & C2CACK)) {
	return(1);
    }

    if ((pe->ce_flags & C2C2ND) != (pq->ce_flags & C2C2ND)) {
	return(1);
    }

    return(0);
}

/*
 *	_c2c_comm_dead
 *
 *	Function:	- completes a request with a dead communicator
 *			  error
 *			- the error is "local dead" when the communicator
 *			  carries the LAM_CLDEAD flag, "remote dead"
 *			  otherwise
 *	Accepts:	- request
 *	Returns:	- 1
 */
int
_c2c_comm_dead(req)

MPI_Request		req;

{
	int		was_active;		/* counted as active? */
/*
 * Only a request that is neither done nor still in the init state is
 * taken off the active request count.
 */
	was_active = (req->rq_state != LAM_RQSDONE)
			&& (req->rq_state != LAM_RQSINIT);
	if (was_active) {
		lam_rq_nactv--;
	}

	req->rq_state = LAM_RQSDONE;
	_c2c_haveadv = 1;

	req->rq_status.MPI_ERROR = (req->rq_comm->c_flags & LAM_CLDEAD)
		? lam_mkerr(MPI_ERR_LOCALDEAD, 0)
		: lam_mkerr(MPI_ERR_REMOTEDEAD, 0);

	return(1);
}

/*
 *	connect_all
 *
 *	Function:	- make tcp or shm connections to all other processes
 *			- processes on the same node that are not flagged
 *			  LAM_PCLIENT are reached through shared memory,
 *			  everything else through a TCP socket
 *			- between two processes, the one whose GPS
 *			  compares lower acts as the TCP server
 *			- the server socket and the link table are
 *			  released on every exit path, with errno
 *			  preserved
 *	Returns:	- 0 or LAMERROR
 */
static int
connect_all()

{
    struct _proc	*p;
    struct _proc	*lowp = 0;		/* lowest proc. rank on node */
    struct _gps		*mygps;			/* my GPS */
    struct nmsg		inmsg;			/* incoming network msg hdr */
    struct nmsg		outmsg;			/* outgoing network msg hdr */
    struct dolink	*links = 0;		/* links to neighbours */
    struct c2c_proc	**grown;		/* enlarged read/write array */
    int4		nlinks = 0;		/* number of links */
    int			sock;			/* socket descriptor */
    int			servsockd;		/* server socket descriptor */
    int			servport = 0;		/* server port number */
    int			rnode;			/* remote node */
    int			rport;			/* remote port */
    int			flag;			/* for setting socket opts */
    int			bufsize;		/* c2c socket buffer size */
    int			nshm = 0;		/* number shared mem procs */
    int			status = LAMERROR;	/* value to return */
    int			saved_errno;		/* errno across cleanup */
    unsigned char	*raddr;			/* remote host address */
    size_t		nbytes;			/* read/write array size */

    mygps = &lam_myproc->p_gps;

    LAM_ZERO_ME(inmsg);
    LAM_ZERO_ME(outmsg);
    servsockd = -1;
    bufsize = LAM_TCPSHORTMSGLEN + ENVSIZE;

    if (lam_nprocs() > 1) {
/*
 * Get links to neighbours, initialize server socket, message headers, etc.
 * Nothing has been acquired yet if the link query fails.
 */
	if (ldogetlinks(&links, &nlinks)) return(LAMERROR);

	servsockd = sfh_sock_open_srv_inet_stm(&servport);
	if (servsockd < 0) goto cleanup;
/*
 * Initialize initialization message headers.  The outgoing header
 * carries our server port to the peers that will connect to us.
 */
	inmsg.nh_flags = 0;
	inmsg.nh_length = 0;
	outmsg.nh_flags = 0;
	outmsg.nh_length = 0;
	outmsg.nh_data[0] = (int4) servport;
    }
/*
 * If this is the first initialization (i.e. we are in MPI_Init() and no
 * communicators have been initialized) then determine if the process is
 * the lowest ranking process on the node and if there are more than one
 * processes on this node.  If so then initialize the shared memory
 * area.  Currently we do not use shared memory to communicate with
 * processes on the same node which are spawned or connected to via
 * accept/connect/join.	 Such processes have the LAM_PCLIENT mode bit
 * set.
 */
    if (lam_comms == 0) {
	for (p = lam_topproc(); p; p = lam_nextproc()) {
	    if (p->p_gps.gps_node == mygps->gps_node
			&& !(p->p_mode & LAM_PCLIENT)) {

		p->p_rpi.c2c.cp_shmidx = nshm;
		nshm++;

		if (lowp == 0 || LAM_GPSCMP(&p->p_gps, &lowp->p_gps) < 0) {
		    lowp = p;
		}
	    } else {
		p->p_rpi.c2c.cp_shmidx = -1;
	    }
	}

	if (nshm > 1 && lowp == lam_myproc) {
	    proc_init(lam_myproc);
	    if (_shm_create_area(nshm, &lam_myproc->p_rpi.c2c, &outmsg)) {
		goto cleanup;
	    }
	}
    }
/*
 * Loop through all processes, initializing the process data and
 * connecting to those on remote nodes not already connected to.
 */
    for (p = lam_topproc(); p; p = lam_nextproc()) {

	if (p->p_mode & LAM_PRPIINIT) {
	    continue;
	}

	proc_init(p);

	if (p == lam_myproc) {
	    continue;
	}

	if (LAM_GPSCMP(mygps, &p->p_gps) >= 0) {
/*
 * Act as a client.
 *
 * NOTE(review): links[] is indexed by rnode below; if nlinks is the
 * number of entries this test should probably be ">=" -- confirm
 * against ldogetlinks().
 */
	    rnode = p->p_gps.gps_node;
	    if (rnode > nlinks) goto cleanup;
/*
 * Make a TCP connection if on another node or if in an accept/connect
 * or spawned.  The server's port arrives in a network message.
 */
	    if ((rnode != mygps->gps_node) || (p->p_mode & LAM_PCLIENT)) {
		fill_sync(p, lam_myproc, &inmsg);
		if (nrecv(&inmsg)) goto cleanup;

		rport = (int) inmsg.nh_data[0];
		raddr = (unsigned char *)
		    &links[rnode].dol_addr.sin_addr.s_addr;

		sock = sfh_sock_open_clt_inet_stm(raddr, rport);
		if (sock < 0) goto cleanup;
	    }
	    else {
		_shm_nprocs++;
		continue;
	    }
	} else {
/*
 * Act as a server.
 *
 * Make a TCP connection if on another node or if in an accept/connect
 * or spawned.  Our port is sent first, then the connection is awaited.
 */
	    if ((p->p_gps.gps_node != mygps->gps_node)
			|| (p->p_mode & LAM_PCLIENT)) {

		fill_sync(lam_myproc, p, &outmsg);
		if (nsend(&outmsg)) goto cleanup;

		sock = sfh_sock_accept_tmout(servsockd, -1);
		if (sock < 0) goto cleanup;
	    }
	    else {
		_shm_nprocs++;
		continue;
	    }
	}
/*
 * Only get here if it's a successful TCP connection.  From here on the
 * socket is recorded in the process entry, where finalize1() can find
 * and close it.
 */
	p->p_rpi.c2c.cp_sock = sock;
	_tcp_smap[sock] = &p->p_rpi.c2c;
/*
 * Set sockets in non-blocking mode and set the send and receive buffer sizes.
 */
	flag = 1;
#if LAM_HAVE_FIONBIO
	if (ioctl(sock, FIONBIO, &flag) == -1)
	  goto cleanup;
#else
	if (fcntl(sock, F_SETFL, O_NONBLOCK) == -1)
	  goto cleanup;
#endif
	FD_CLR(sock, &_tcp_block);

	if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
	    		(char *) &flag, sizeof(flag))) {
	    goto cleanup;
	}

	if (sfh_sock_set_buf_size(sock, SFH_INET, SO_SNDBUF, bufsize)) {
	    goto cleanup;
	}

	if (sfh_sock_set_buf_size(sock, SFH_INET, SO_RCVBUF, bufsize)) {
	    goto cleanup;
	}
    }
/*
 * The lowest shared memory process now sends information about the
 * shared memory area to all other processes on the same node which are
 * using shared memory.
 */
    if (nshm > 1) {
	if (lam_myproc == lowp) {
	    for (p = lam_topproc(); p; p = lam_nextproc()) {
		if (p == lam_myproc) continue;

		if (p->p_rpi.c2c.cp_shmidx >= 0) {
		    fill_sync(lam_myproc, p, &outmsg);
		    if (nsend(&outmsg)) goto cleanup;
		}
	    }
	} else if (lam_myproc->p_rpi.c2c.cp_shmidx >= 0) {
	    fill_sync(lowp, lam_myproc, &inmsg);
	    if (nrecv(&inmsg)) goto cleanup;

	    if (_shm_attach_area(nshm, &lam_myproc->p_rpi.c2c, &inmsg)) {
		goto cleanup;
	    }
	}
/*
 * Initialize the shared memory in and out short message postboxes for each
 * peer process being communicated with via shared memory.  The boxes
 * form an nshm by nshm table after the first cache line; our inbox for
 * a peer sits at [our index][peer index], our outbox at the transposed
 * position.
 */
	for (p = lam_topproc(); p; p = lam_nextproc()) {

	    if (p == lam_myproc || p->p_rpi.c2c.cp_shmidx < 0) {
		continue;
	    }

	    p->p_rpi.c2c.cp_inbox = (postbox_t *) (_shm_membase + CACHELINESIZE
			+ SHMBOXSIZE * (lam_myproc->p_rpi.c2c.cp_shmidx
			* nshm + p->p_rpi.c2c.cp_shmidx));
	    p->p_rpi.c2c.cp_outbox = (postbox_t *) (_shm_membase + CACHELINESIZE
			+ SHMBOXSIZE * (p->p_rpi.c2c.cp_shmidx * nshm
			+ lam_myproc->p_rpi.c2c.cp_shmidx));
	}
    }
/*
 * Set up or enlarge the read/write arrays if necessary.  The recorded
 * array size only changes once both arrays have the new size.
 */
    if (_shm_nprocs > _shm_narray) {
	nbytes = _shm_nprocs * sizeof(struct c2c_proc *);

	if (_shm_narray == 0) {
	    _shm_read = (struct c2c_proc **) malloc(nbytes);
	    _shm_write = (struct c2c_proc **) malloc(nbytes);

	    if (_shm_read == 0 || _shm_write == 0) {
/*
 * Do not keep half of the pair around.
 */
		saved_errno = errno;
		free(_shm_read);
		free(_shm_write);
		_shm_read = 0;
		_shm_write = 0;
		errno = saved_errno;
		goto cleanup;
	    }
	} else {
/*
 * A failed realloc() leaves the old array valid, so the global pointer
 * is only replaced on success.
 */
	    grown = (struct c2c_proc **) realloc(_shm_read, nbytes);
	    if (grown == 0) goto cleanup;
	    _shm_read = grown;

	    grown = (struct c2c_proc **) realloc(_shm_write, nbytes);
	    if (grown == 0) goto cleanup;
	    _shm_write = grown;
	}

	_shm_narray = _shm_nprocs;
    }

    status = 0;
/*
 * Common exit.  The server socket is only needed while connections are
 * being set up and the link table only to look up peer addresses.
 */
cleanup:
    saved_errno = errno;

    if (servsockd >= 0) {
	close(servsockd);
    }
    if (links != NULL)
      free(links);

    errno = saved_errno;
    return(status);
}

/*
 *	proc_init
 *
 *	Function:	- initialize c2c specific process data
 *	Accepts:	- process
 */
static void
proc_init(p)

struct _proc		*p;

{
	p->p_mode |= LAM_PRPIINIT;
	p->p_rpi.c2c.cp_sock = -1;
	p->p_rpi.c2c.cp_mreq = 0;
	p->p_rpi.c2c.cp_inbox = 0;
	p->p_rpi.c2c.cp_outbox = 0;
	p->p_rpi.c2c.cp_write = 0;
	p->p_rpi.c2c.cp_read = 0;
	p->p_rpi.c2c.cp_rreq = 0;
	p->p_rpi.c2c.cp_wreq = 0;
	p->p_rpi.c2c.cp_nbfde = 0;
	p->p_rpi.c2c.cp_extra = 0;
/*
 * Set up to read in an envelope.
 */
	p->p_rpi.c2c.cp_nenvin = ENVSIZE;
	if (!(p->p_mode & LAM_PCLIENT)
			&& (p->p_gps.gps_node == lam_myproc->p_gps.gps_node)) {
		p->p_rpi.c2c.cp_readfn = _shm_proc_read_env;
	} else {
		p->p_rpi.c2c.cp_readfn = _tcp_proc_read_env;
		p->p_rpi.c2c.cp_envbuf = (char *) &p->p_rpi.c2c.cp_env;
	}
}

/*
 *	fill_sync
 *
 *	Function:	- fill in network message sync for connecting
 *	Accepts:	- source process
 *			- destination process
 *			- network message header (filled)
 */
static void
fill_sync(src, dest, head)

struct _proc		*src;
struct _proc		*dest;
struct nmsg		*head;

{
/*
 * This uses in effect synchronization MPI_COMM_WORLD and tag 0.
 */
	_m2l_fillsys(src->p_gps.gps_node, src->p_gps.gps_idx,
		dest->p_gps.gps_node, dest->p_gps.gps_idx, 0, 0, head);
}

/*
 *	finalize1
 *
 *	Function:	- cleanup a process
 *			- shuts down and closes its socket if it has one
 *			  and marks the process as having none
 *	Accepts:	- process
 *	Returns:	- 0 (the results of shutdown() and close() are
 *			  not examined)
 */
static int
finalize1(p)

struct _proc		*p;

{
	int		fd;			/* socket of the process */

	fd = p->p_rpi.c2c.cp_sock;
/*
 * Without a socket there is nothing to release.
 */
	if (fd < 0) {
		return(0);
	}
/*
 * Stop traffic in both directions (how == 2) before closing.
 */
	shutdown(fd, 2);
	close(fd);
	p->p_rpi.c2c.cp_sock = -1;

	return(0);
}

/*
 *	_rpi_c2c_fastsend
 *
 *	Function:	- fast blocking send
 *	Accepts:	- buffer to send
 *			- message count
 *			- message datatype
 *			- destination process rank
 *			- message tag
 *			- message communicator
 *	Returns:	- MPI_SUCCESS or error code
 */
int
_rpi_c2c_fastsend(buf, count, dtype, dest, tag, comm)

char			*buf;
int			count;
MPI_Datatype		dtype;
int			dest;
int			tag;
MPI_Comm		comm;

{
    double		local_rep;		/* local data representation */
    double		net_rep;		/* net data representation */
    struct _proc	*destproc;
    char		*packbuf;
    int			packsize;
    int			err;
/*
 * Check common arguments.
 */
    if (count < 0) {
	return(lam_mkerr(MPI_ERR_COUNT, 0));
    }

    if (dtype == MPI_DATATYPE_NULL || (!dtype->dt_commit)) {
	return(lam_mkerr(MPI_ERR_TYPE, 0));
    }

    if (LAM_IS_INTER(comm)) {
	if ((dest < 0) || (dest >= comm->c_rgroup->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	destproc = comm->c_rgroup->g_procs[dest];
    }
    else {
	if ((dest < 0) || (dest >= comm->c_group->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	destproc = comm->c_group->g_procs[dest];
    }
/*
 * Handle zero length messages.
 */
    if (count == 0 || dtype->dt_size == 0) {
	packbuf = buf;
	packsize = 0;
    }
    else {
	local_rep = 1.1;
	ltotf8(&local_rep, &net_rep);
/*
 * If contiguous, use the caller's buffer.
 */
	packsize = count * dtype->dt_size;

	if ((dtype->dt_flags & LAM_DTNOPACK)
		&& ((dtype->dt_flags & LAM_DTNOXADJ) || count == 1)
		&& ((local_rep == net_rep) || lam_homog)) {
	    packbuf = buf;
/*
 * Check for bad buffer.
 */
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_BUFFER, 0));
	    }
	}
/*
 * Otherwise allocate a buffer and pack the message into it.
 */
	else {
	    packbuf = malloc(packsize);
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_OTHER, errno));
	    }

	    if (lam_pack(buf, count, dtype, packbuf, packsize) < 0) {
		return(lam_mkerr(MPI_ERR_INTERN, errno));
	    }
	}
    }
/*
 * Call appropriate transport protocol.
 */
    if (destproc->p_rpi.c2c.cp_inbox != 0) {
	err = _shm_fastsend(packbuf, packsize,
				&destproc->p_rpi.c2c, dest, tag, comm);
    } else {
	err = _tcp_fastsend(packbuf, packsize,
				&destproc->p_rpi.c2c, dest, tag, comm);
    }

    if (packbuf != buf) {
	free(packbuf);
    }

    return(err);
}

/*
 *	_rpi_c2c_fastrecv
 *
 *	Function:	- fast blocking receive
 *	Accepts:	- buffer to receive into
 *			- message count
 *			- message datatype
 *			- source process rank
 *			- message tag (inout)
 *			- message communicator
 *			- status (out)
 *			- seqnum (out)
 *	Returns:	- MPI_SUCCESS or error code
 */
int
_rpi_c2c_fastrecv(buf, count, dtype, src, tag, comm, stat, seqnum)

char			*buf;
int			count;
MPI_Datatype		dtype;
int			src;
int			*tag;
MPI_Comm		comm;
MPI_Status		*stat;
int			*seqnum;

{
    double		local_rep;		/* local data representation */
    double		net_rep;		/* net data representation */
    struct _proc	*srcproc;
    char		*packbuf;
    int			packsize;
    int			err;
/*
 * Check common arguments.
 */
    if (count < 0) {
	return(lam_mkerr(MPI_ERR_COUNT, 0));
    }

    if (dtype == MPI_DATATYPE_NULL || (!dtype->dt_commit)) {
	return(lam_mkerr(MPI_ERR_TYPE, 0));
    }

    if (LAM_IS_INTER(comm)) {
	if ((src < 0) || (src >= comm->c_rgroup->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	srcproc = comm->c_rgroup->g_procs[src];
    }
    else {
	if ((src < 0) || (src >= comm->c_group->g_nprocs)) {
	    return(lam_mkerr(MPI_ERR_RANK, 0));
	}

	srcproc = comm->c_group->g_procs[src];
    }
/*
 * Handle zero length messages.
 */
    if (count == 0 || dtype->dt_size == 0) {
	packbuf = buf;
	packsize = 0;
    }
    else {
	local_rep = 1.1;
	ltotf8(&local_rep, &net_rep);
/*
 * If contiguous, use the caller's buffer.
 */
	packsize = count * dtype->dt_size;

	if ((dtype->dt_flags & LAM_DTNOPACK)
		&& ((dtype->dt_flags & LAM_DTNOXADJ) || count == 1)
		&& ((local_rep == net_rep) || lam_homog)) {
	    packbuf = buf;
/*
 * Check for bad buffer.
 */
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_BUFFER, 0));
	    }
	}
/*
 * Otherwise allocate a buffer.
 */
	else {
	    packbuf = malloc(packsize);
	    if (packbuf == 0) {
		return(lam_mkerr(MPI_ERR_OTHER, errno));
	    }
	}
    }
/*
 * Call appropriate transport protocol.
 */
    if (srcproc->p_rpi.c2c.cp_sock < 0) {
	err = _shm_fastrecv(packbuf, &packsize,
				&srcproc->p_rpi.c2c, src, tag, comm, seqnum);
    } else {
	err = _tcp_fastrecv(packbuf, &packsize,
				&srcproc->p_rpi.c2c, src, tag, comm, seqnum);
    }

    if (stat != MPI_STATUS_IGNORE) {
	stat->MPI_SOURCE = src;
	stat->MPI_TAG = *tag;
	stat->MPI_ERROR = err;
	stat->st_length = packsize;
    }
/*
 * Unpack received message into user's buffer if necessary.
 */
    if (packbuf != buf) {
	if (lam_unpack(packbuf, packsize, buf, count, dtype) < 0) {
	    return(lam_mkerr(MPI_ERR_INTERN, errno));
	}

	free(packbuf);
    }

    return(err);
}
