/* seq2mtx - convert single sequence to pseudo IMPALA mtx file */

/* Copyright (C) 2000 D.T. Jones */

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <math.h>
#include <string.h>

//#define DEBUG

#define MAXSEQLEN 65536

#define FALSE 0
#define TRUE 1

#define SQR(x) ((x)*(x))
#define MIN(x,y) (((x)<(y))?(x):(y))
#define MAX(x,y) (((x)>(y))?(x):(y))

/* One-letter residue codes. The position of a letter in this string is the
   numeric code (0-22) that aanum() returns for it: the 20 standard amino
   acids first, then B, Z and X. */
const char *rescodes = "ARNDCQEGHILKMFPSTWYVBZX";

/*  BLOSUM 62 substitution scores.
 *
 *  Both dimensions are indexed by the numeric residue code (0-22), i.e. in
 *  the order given by rescodes above, so aamat[aanum(a)][aanum(b)] is the
 *  score for aligning residue a with residue b.
 *
 *  NOTE(review): the C/C self-score (row 4, column 4) is 10 here; commonly
 *  published BLOSUM62 tables list 9 -- confirm this is intentional.
 */
const short           aamat[23][23] =
{
    {4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0},
    {-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1},
    {-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1},
    {-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4,
     -3, -3, 4, 1, -1},
    {0, -3, -3, -3,10, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2,
     -2, -1, -3, -3, -2},
    {-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2,
     -1, -2, 0, 3, -1},
    {-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3,
     -2, -2, 1, 4, -1},
    {0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2,
     -3, -3, -1, -2, -1},
    {-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2,
     2, -3, 0, 0, -1},
    {-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3,
     -1, 3, -3, -3, -1},
    {-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2,
     -1, 1, -4, -3, -1},
    {-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3,
     -2, -2, 0, 1, -1},
    {-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1,
     -1, 1, -3, -1, -1},
    {-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1,
     3, -1, -3, -3, -1},
    {-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4,
     -3, -2, -2, -1, -2},
    {1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3,
     -2, -2, 0, 0, 0},
    {0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2,
     -2, 0, -1, -1, 0},
    {-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11,
     2, -3, -4, -3, -2},
    {-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2,
     7, -1, -3, -2, -1},
    {0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3,
     -1, 4, -3, -2, -1},
    {-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4,
     -3, -3, 4, 1, -1},
    {-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3,
     -2, -2, 1, 4, -1},
    {0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2,
     -1, -1, -1, -1, 4}
};

/* Fatal-error exit: print errstr to standard error, framed by blank lines
   and a "***" marker, then terminate the process with exit status -1.
   This function never returns to its caller. */
void
  fail(char *errstr)
{
    (void) fprintf(stderr, "\n*** %s\n\n", errstr);

    exit(-1);
}

/* Convert AA letter to numeric code (0-22).
 *
 * ch is the residue letter, upper or lower case. ASCII letters are mapped
 * through the table below (J, O, U and X all map to 22); any other value,
 * including negative values and bytes above 127, yields 22.
 *
 * The letter test is done with explicit range checks rather than isalpha():
 * isalpha() is undefined for negative arguments other than EOF (which a
 * plain signed char can produce), and in some locales it accepts non-ASCII
 * bytes for which (ch & 31) would select the 999 sentinel or index past the
 * end of the table.
 */
int
  aanum(int ch)
{
    /* Indexed by (letter & 31): 1 = A/a ... 26 = Z/z. Slot 0 is a sentinel
       that is never selected for a letter. */
    static const int aacvs[] =
    {
	999, 0, 20, 4, 3, 6, 13, 7, 8, 9, 22, 11, 10, 12, 2,
	22, 14, 5, 1, 15, 16, 22, 19, 17, 22, 18, 21
    };

    if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'))
	return aacvs[ch & 31];

    return 22;
}
char *rootname(const char* filename, char* rtname)/*{{{*/
/*****************************************************************************
 * rootname
 * Strip the directory part and the last extension from a file name.
 *
 * filename : path to examine; everything up to and including the last '/'
 *            is discarded.
 * rtname   : caller-supplied output buffer. It must be large enough for the
 *            base name plus the terminating NUL (at most strlen(filename)+1
 *            bytes); no size is passed in, so this is not checked here.
 *
 * The base name is cut at its last '.', e.g. "/a/b/c.tar.gz" -> "c.tar".
 * A base name without any '.' is copied unchanged, so rtname is always
 * filled in and callers may ignore the return value.
 *
 * Returns rtname.
 ****************************************************************************/
{
    const char *base;
    const char *dot;
    size_t n;

    base = strrchr(filename, '/');
    base = (base != NULL) ? base + 1 : filename;

    dot = strrchr(base, '.');
    n = (dot != NULL) ? (size_t) (dot - base) : strlen(base);

    /* memmove, so that rtname may overlap filename */
    memmove(rtname, base, n);
    rtname[n] = '\0';

    return rtname;
}
/*}}}*/
char** ReadFileList(const char* filename, char **filenameList, int* p_cntfile)/*{{{*/
/*****************************************************************************
 * ReadFileList
 * Read a list file (one file name per line) into a newly allocated array
 * of newly allocated strings.
 *
 * filename     : path of the list file.
 * filenameList : ignored on entry; kept for interface compatibility. The
 *                array is always allocated here.
 * p_cntfile    : receives the number of names stored in the array.
 *
 * Empty lines are skipped and a trailing '\r' (CRLF line endings) is
 * removed from each name.
 *
 * Returns the array; the caller owns it and must free() every element and
 * then the array itself. Any I/O or allocation failure ends the program
 * through fail().
 ****************************************************************************/
{
    FILE *fpin;
    long filesize;
    size_t nbytes;
    size_t maxlines;
    size_t i;
    char *text;
    char *line;
    int cntfile = 0;

    fpin = fopen(filename, "r");
    if (fpin == NULL) {
        fail("Unable to open listFile!");
    }

    /* determine the file size so the whole file can be read in one go */
    if (fseek(fpin, 0, SEEK_END) != 0) {
        fail("fseek of the listFile failed");
    }
    filesize = ftell(fpin);
    if (filesize < 0) {
        fail("ftell of the listFile failed");
    }
    if (fseek(fpin, 0, SEEK_SET) != 0) {
        fail("fseek of the listFile failed");
    }
    nbytes = (size_t) filesize;

    /* read the whole file into text */
    text = malloc(nbytes + 1);
    if (text == NULL) {
        fail("Out of memory while reading the listFile");
    }
    if (fread(text, sizeof(char), nbytes, fpin) != nbytes) {
        fail("fread of the listFile failed");
    }
    fclose(fpin);
    text[nbytes] = '\0';

    /* Upper bound on the number of names: one per '\n', plus one for a
       final line that has no terminating newline. */
    maxlines = 1;
    for (i = 0; i < nbytes; i++) {
        if (text[i] == '\n') {
            maxlines++;
        }
    }
    filenameList = malloc(sizeof *filenameList * maxlines);
    if (filenameList == NULL) {
        fail("Out of memory while reading the listFile");
    }

    /* split text into lines and copy each non-empty one */
    for (line = strtok(text, "\n"); line != NULL; line = strtok(NULL, "\n")) {
        size_t n = strlen(line);

        if (n > 0 && line[n - 1] == '\r') {
            line[--n] = '\0';
        }
        if (n == 0) {
            continue;
        }
        filenameList[cntfile] = malloc(n + 1);
        if (filenameList[cntfile] == NULL) {
            fail("Out of memory while reading the listFile");
        }
        memcpy(filenameList[cntfile], line, n + 1);
        cntfile++;
    }
#ifdef DEBUG
    {
        int k;
        for (k = 0; k < cntfile; k++)
        {
            fprintf(stderr,"%d:%s\n", k, filenameList[k]);
        }
    }
#endif
    free(text);
    *p_cntfile = cntfile;
    return filenameList;

}/*}}}*/

/* Read the next line of lfil into buf (capacity MAXSEQLEN). On end of file
   or read error buf is set to the empty string, so that a stale line is
   never parsed a second time. Returns nonzero when a line was read. */
static int
  getseq_nextline(char *buf, FILE * lfil)
{
    if (fgets(buf, MAXSEQLEN, lfil))
	return 1;
    buf[0] = '\0';
    return 0;
}

/* Store at most 70 characters of src in dbname, always NUL-terminated */
static void
  getseq_setname(char *dbname, const char *src)
{
    size_t n = strlen(src);

    if (n > 70)
	n = 70;
    memcpy(dbname, src, n);
    dbname[n] = '\0';
}

/* This routine will read in one sequence from a database file. The
   sequence can be in any of the supported formats (GCG, EMBL, OWL, FASTA,
   or an unrecognised format that is scanned for the first line that looks
   like sequence data).

   dbname : receives the entry name, at most 70 characters plus the NUL
            (so at least 71 bytes are needed).
   dseq   : receives the residues, upper-cased and NUL-terminated; must
            have room for MAXSEQLEN bytes.
   lfil   : open input stream, read from its current position.

   Returns the length of the sequence, or -1 when nothing could be read.
   A return value of MAXSEQLEN means the input did not fit: dseq then
   holds the first MAXSEQLEN-1 residues and a warning has been printed.
   If the file ends before any sequence data, 0 is returned.
*/
int
  getseq(char *dbname, char *dseq, FILE * lfil)
{
    int i, j, len;
    short badln, fformat;
    enum
    {
	UNKNOWN, EMBL, FASTA, OWL, GCG
    };
    char buf[MAXSEQLEN], split;

    j = 0;
    dbname[0] = '\0';

    if (!fgets(buf, MAXSEQLEN, lfil))
	return (-1);

    /* Guess the format from the first line */
    if (strstr(buf, "of:") != NULL && strstr(buf, "check:") != NULL)
	fformat = GCG;
    else if (strncmp(buf, "ID   ", 5) == 0)
	fformat = EMBL;
    else if (buf[0] == '>' &&
	     (buf[1] == '>' ||
	      /* only look at buf[3] when the line is that long */
	      (buf[1] != '\0' && buf[2] != '\0' && buf[3] == ';')))
	fformat = OWL;
    else if (buf[0] == '>')
	fformat = FASTA;
    else
    {
	fprintf(stderr, "WARNING: Attempting to interpret input file with unknown format");
	fformat = UNKNOWN;
    }

    /* Extract the name and advance buf to the first line of sequence data.
       Every skip loop stops at end of file instead of spinning forever. */
    switch (fformat)
    {
    case GCG:
	if (sscanf(strstr(buf, "of:") + 3, "%70s", dbname) != 1)
	    dbname[0] = '\0';
	while (strstr(buf, "..") == NULL)
	    if (!getseq_nextline(buf, lfil))
		break;
	getseq_nextline(buf, lfil);
	break;

    case EMBL:
	getseq_setname(dbname, buf + 5);
	while (buf[0] != ' ')
	    if (!getseq_nextline(buf, lfil))
		break;
	break;

    case OWL:
	getseq_nextline(buf, lfil);
	getseq_setname(dbname, buf);
	getseq_nextline(buf, lfil);
	break;

    case FASTA:
	getseq_setname(dbname, buf + 1);
	getseq_nextline(buf, lfil);
	break;

    default:
	/* Try to find a line which looks like a protein sequence */
	do
	{
	    badln = (strpbrk(buf, "JjOoUu<>#$%&@") != NULL);
	    if (badln && !fgets(buf, MAXSEQLEN, lfil))
		return (-1);
	}
	while (badln);
	strcpy(dbname, "<NO NAME>");
	break;
    }

    /* Drop the newline that came with the name line, if any */
    len = (int) strlen(dbname);
    if (len > 0 && dbname[len - 1] == '\n')
	dbname[--len] = '\0';

    for (;;)
    {
	if (!strncmp(buf, "//", 2))
	    break;
	len = (int) strlen(buf);
	for (i = 0; i < len && j < MAXSEQLEN; i++)
	{
	    /* <ctype.h> functions need an unsigned char value */
	    unsigned char uc = (unsigned char) buf[i];

	    split = islower(uc) ? (char) toupper(uc) : buf[i];
	    if (split == '@' || (fformat == OWL && split == '*'))
	    {
		/* explicit end-of-sequence marker: finish, draining the file */
		dseq[j] = '\0';
		while (fgets(buf, MAXSEQLEN, lfil));
		return (j);
	    }
	    if (isalpha((unsigned char) split))
		dseq[j++] = split;
	    else if (buf[i] == '\n')
		break;
	}
	if (!fgets(buf, MAXSEQLEN, lfil))
	    break;
    }

    if (j == MAXSEQLEN)
    {
	printf("\nWARNING: sequence %s over %d long; truncated!\n",
	       dbname, MAXSEQLEN);
	/* The buffer is full: terminate inside it rather than one past its
	   end. The return value stays MAXSEQLEN so callers can detect it. */
	dseq[MAXSEQLEN - 1] = '\0';
    }
    else
	dseq[j] = '\0';

    return (j);
}

/* Print the command-line usage summary to standard output */
void PrintHelp()
{
    static const char *const usage_lines[] = {
        "usage:  seq2mtx [options] single-fasta-seq-file\n",
        "\n",
        "options:\n",
        "    -outpath <dir> : set output path, default=./\n",
        "    -l <file>      : set the list file\n",
        "    -h|--help  : print this help message\n",
        "\n",
        "updated 2010-11-18, Nanjiang\n"
    };
    size_t k;

    for (k = 0; k < sizeof usage_lines / sizeof usage_lines[0]; k++)
        fputs(usage_lines[k], stdout);
}

/* Copy a command-line argument into a fixed-size buffer; the program is
   aborted through fail() when the argument does not fit. */
static void copy_arg(char *dst, size_t dstsize, const char *src)
{
    if (strlen(src) >= dstsize)
        fail("Command-line argument too long!");
    strcpy(dst, src);
}

/* Write the pseudo mtx file for seq (seqlen residues) to fpout: the length,
   the sequence, twelve "0" lines, then one row of 26 scores per residue.
   Columns whose NCBI code letter is 'X' get -32768; the others get the
   aamat score multiplied by 100. */
static void write_mtx(FILE *fpout, const char *seq, int seqlen)
{
    static const char ncbicodes[] = "XAXCDEFGHIKLMNPQRSTVWXYXXX";
    int i, j;

    fprintf(fpout, "%d\n", seqlen);
    fprintf(fpout, "%s", seq);
    fprintf(fpout, "\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n0\n");

    for (i = 0; i < seqlen; i++)
    {
        const short *row = aamat[aanum((unsigned char) seq[i])];

        for (j = 0; j < 26; j++)
        {
            if (ncbicodes[j] != 'X')
                fprintf(fpout, "%d  ", row[aanum(ncbicodes[j])] * 100);
            else
                fprintf(fpout, "-32768  ");
        }
        fprintf(fpout, "\n");
    }
}

/* Read one sequence from infile and write its matrix to outfile, or to
   standard output when outfile is NULL. The output file is only created
   once the sequence has been read and validated. Any failure ends the
   program through fail(). */
static void seq_file_to_mtx(const char *infile, const char *outfile)
{
    /* static: two 64 KiB buffers are kept off the stack */
    static char desc[MAXSEQLEN], seq[MAXSEQLEN];
    FILE *ifp;
    FILE *fpout;
    int seqlen;

    ifp = fopen(infile, "r");
    if (!ifp)
        fail("Unable to open sequence file!");

    seqlen = getseq(desc, seq, ifp);
    fclose(ifp);

    if (seqlen < 5 || seqlen >= MAXSEQLEN)
        fail("Sequence length error!");

    if (outfile == NULL)
    {
        write_mtx(stdout, seq, seqlen);
        return;
    }

    fpout = fopen(outfile, "w");
    if (!fpout)
        fail("Unable to open output file!");
    write_mtx(fpout, seq, seqlen);
    if (fclose(fpout) != 0)
        fail("Error writing output file!");
}

/* Program entry point.
 *
 * Usage: seq2mtx [options] single-fasta-seq-file
 *   -outpath <dir> : directory for the .mtx files written in list mode
 *   -l <file>      : list file with one sequence file name per line
 *   -h | --help    : print the usage text and exit
 *
 * With a sequence file on the command line the matrix is written to
 * standard output. Otherwise, with -l, each listed file is converted to
 * <outpath>/<rootname>.mtx.
 *
 * Returns 0 on success and 1 when no input was specified; fatal errors
 * terminate through fail().
 */
int main(int argc, char **argv)
{
    int i;
    char outpath[500]="./";
    char listFile[500]="";
    char fastaFile[500]="";

    if (argc < 2)
    {
        PrintHelp();
        return 1;
    }

    i = 1;
    while (i < argc)
    {
        if (strcmp(argv[i],"-h") == 0 || strcmp(argv[i],"--help") == 0)
        {
            PrintHelp();
            return 0;
        }
        else if (strcmp(argv[i],"-outpath") == 0 || strcmp(argv[i],"--outpath") == 0)
        {
            if (i + 1 >= argc)
                fail("Option -outpath requires an argument!");
            copy_arg(outpath, sizeof outpath, argv[i+1]);
            i += 2;
        }
        else if (strcmp(argv[i],"-l") == 0 || strcmp(argv[i],"--l") == 0)
        {
            if (i + 1 >= argc)
                fail("Option -l requires an argument!");
            copy_arg(listFile, sizeof listFile, argv[i+1]);
            i += 2;
        }
        else
        {
            copy_arg(fastaFile, sizeof fastaFile, argv[i]);
            i += 1;
        }
    }

    if (strcmp(fastaFile,"") != 0)
    {
        /* single-file mode: matrix goes to standard output */
        seq_file_to_mtx(fastaFile, NULL);
    }
    else if (strcmp(listFile,"") != 0)
    {
        char cmd[sizeof outpath + 16];
        char rtname[500]="";
        char outfile[sizeof outpath + sizeof rtname + 8];
        char **filenameList = NULL;
        int cntfile = 0;
        int ifile;

        /* NOTE(review): outpath is handed to the shell unquoted, so paths
           containing spaces or shell metacharacters are not handled. */
        snprintf(cmd, sizeof cmd, "mkdir -p %s", outpath);
        if (system(cmd) != 0)
            fprintf(stderr, "WARNING: could not create output path %s\n", outpath);

        filenameList = ReadFileList(listFile, filenameList, &cntfile);

        for (ifile = 0; ifile < cntfile; ifile++)
        {
            const char *root;
            int n;

#ifdef DEBUG
            fprintf(stderr,"%d: %s\n", ifile, filenameList[ifile]);
#endif
            /* rootname() cannot check the size of rtname itself */
            if (strlen(filenameList[ifile]) >= sizeof rtname)
                fail("File name in list file is too long!");

            /* Use the returned pointer: it is valid whether or not the
               name had an extension. */
            root = rootname(filenameList[ifile], rtname);

            n = snprintf(outfile, sizeof outfile, "%s/%s.mtx", outpath, root);
            if (n < 0 || (size_t) n >= sizeof outfile)
                fail("Output file name is too long!");

            seq_file_to_mtx(filenameList[ifile], outfile);
        }

        for (ifile = 0; ifile < cntfile; ifile++)
        {
            free(filenameList[ifile]);
        }
        free(filenameList);
    }
    else
    {
        /* options were given, but nothing to convert */
        PrintHelp();
        return 1;
    }

    return 0;
}
