IUBio

fasta and blast searches

Don Gilbert gilbertd at bio.indiana.edu
Thu Feb 20 18:54:09 EST 1997


Thure,
 
I customized version 4 of SRS to produce a file of indices
into databanks as a search result, and feed that into
FastA (which was also customized) to allow one to do 
fasta searches w/o having to generate a temp data file
that was huge.  Is that of any interest to you or others (the srs
customization)?  I don't think I would want to run an SRS-fasta
search that allowed users to generate temp files in the multi-gigabyte
range, and even if I had that disk space, it would likely be slower
than generating byte indices.
 
-- Don

The main function I had to add to SRS 4 was one that lists file indices:
in the seqlib.c file, I added the SlbListFSE function below,
then added an item to getz.c to allow calling it, plus a few
configuration items (in the odd files).  I haven't had a chance
to port this to SRS 5 yet; however, I will do so when time permits.

see http://iubio.bio.indiana.edu/srsfasta/



	/* --- ListFSE parts --------------*/

/* dgg - from hash.c, for hashing file names...
 *
 * Folds the NUL-terminated string at s into one long: each character is
 * added after shifting the running value left by four bits, and whenever
 * any of bits 28..31 become set they are XOR-ed back in (shifted right
 * by 24) and then cleared.
 */
static  long HashString(void *s)
{
  const char    *cursor = (const char *) s;
  unsigned long  hash = 0;
  unsigned long  top;

  for ( ; *cursor != '\0'; cursor++) {
    hash = (hash << 4) + *cursor;
    top = hash & 0xf0000000;
    if (top != 0)
      hash = (hash ^ (top >> 24)) ^ top;
  }
  return (long) hash;
}


/* State that SlbListFSE keeps between calls. */

/* Document start offset and file id of the entry handled by the previous
   call; used to skip an entry that repeats the previous one. */
static long   lastdocstart = -1;
static short  lastfileid   = -1;

/* Cache of HashString() values of the data-file names already listed:
   nFofn slots are in use, maxFofn are allocated. */
static long   nFofn        = 0;
static long   maxFofn      = 0;
static long  *hashFofn     = NULL;

/* Suffixes appended to the "listFSE" parameter value to name the two
   output files (entry indices and data-file names). */
char *gOutentrySuffix = ".indices";
char *gOutnamesSuffix = ".names";

void SlbListFSE(ENTRYo *entry, char *setName, int entryN, int entryCurrN)
{
	/* dgg addition 
	- list sequence position in file 
	*/
	
	if ( EntryOpenData (entry)) {
		short   entrynuminfile, seqformat, fileid;
		long	  docstart, datastart, dataend;
		char	* filename;
		IDoENTRY  nextid;
	  ENTRYo   *nextentry;
	  
		filename= entry->file[1]->nam;
		fileid= 0; /* this must be set to filename entry in file of filenames ! */
		entrynuminfile= entry->id->fip; /* need to look at next entry for end index !*/
		docstart= entry->fip[0];
		datastart= entry->fip[1];
		/* !! NOTE: for GENBANK (? and others) the entry->fip
		currently points at the "ORIGIN" line above data. 
		In SwissProt,EMBL it points to "SQ   SEQUENCE   544 AA..." above data
		In PIR, it points to 'SEQUENCE' line above data.
		Must bypass this line here or in index users progs... 
		*/
 		datastart= ftell(entry->file[1]->fil); /* this is after 1st line read of EntryOpenData() */

		dataend= 0; /* flag we don't know end */
		/* we want real file format, in fasta/pearson lib file values */
		seqformat= entry->lib->form->fil_t[1]->type - SLBxPEARSON;
			    
    if (!entryCurrN || !file) {
			/* force output to files -- file of entry indices & file of filenames */
    	char  *tmp, *outentry;
    	char  outDirName[FILxXNAM+1], outFile[FILxXNAM+1];
	    
	    lastdocstart= 0; lastfileid= 0;
			nFofn = 0; 
			maxFofn= 20;
			if (hashFofn) free(hashFofn);
			hashFofn = (long*) malloc( maxFofn * sizeof(long));
	    
	    if ((tmp = ParGetStr ("tempDirName"))) sprintf (outDirName, "%s", tmp);    
	    else  *outDirName = '\0';
    
    	outentry= ParGetStr("listFSE");
			sprintf (outFile, "%s%s%s", outDirName, outentry, gOutentrySuffix);
#if 1
			_FilLN(outFile);
      file = fopen( outFile, "r+");
      if (!file) file = fopen( outFile, "w");
      else fseek(file, 0, 2);
#else
      file = FilOpenU(outFile, &errCode);
      _ErrMsg2 (errCode, outFile);
#endif

			sprintf (outFile, "%s%s%s", outDirName, outentry, gOutnamesSuffix);
			_FilLN(outFile);
      fofn = fopen( outFile, "r+");
      if (!fofn) 
      	fofn = fopen( outFile, "w");
      else {
      		/* read & store hashFofn values */
      	char *ep, aline[FILxXNAM+20];
      	*aline= '\0';
     		while ( fgets( aline, sizeof(aline), fofn) ) {
     			ep= aline;
     			while (*ep && *ep != '\t' && *ep != ' ' && *ep != '\n') ep++;
      	  if (*ep) *ep= '\0';
		   		if (nFofn >= maxFofn) {
		   			maxFofn = nFofn + 20;
		   			hashFofn= ( long*) realloc(hashFofn, maxFofn * sizeof( long));
		   			}
      		if (hashFofn) hashFofn[nFofn]= HashString( aline);
      		nFofn++;
      		}
      	}
      }
      
  	if (fofn) {
			/* !! save time by storing filenames in local array !? */
	    short linenum, more;
	    long  namelen;
			long filehash;
			
			more= 1;
			filehash= HashString( filename);
			if (hashFofn) for (linenum=0; linenum<nFofn; linenum++) {
				if (filehash == hashFofn[linenum]) { 
					fileid= linenum;
					more= 0;
					break;
					}
				}
				
      if (more) {
	char * libname;
      	libname= LibGetName(entry->lib,"full"); 
      	/*libname= entry->lib->lnam[0]; /* short name !? */

	fileid= nFofn;
     	fseek( fofn, 0, 2);
      	fprintf( fofn,"%s\t%d\t%d\t%s\n", filename, seqformat, fileid,libname);
	   		if (nFofn >= maxFofn) {
	   			maxFofn = nFofn + 20;
	   			hashFofn= ( long*) realloc(hashFofn, maxFofn * sizeof( long));
	   			}
	   		if (hashFofn) hashFofn[nFofn]= filehash;
	   		nFofn++;
   			linenum++;
    		}
    	}

	  if (fileid == lastfileid && docstart == lastdocstart)
	  	goto skipListFSE;
    
		/* find data end -- shouldn't this be in entry record somewhere ! */
		IdCopy( &nextid, entry->id);
		nextid.fip++; /* skip to next entry in library */
		nextentry = EntryOpen (&nextid);
   	if (nextentry) { 
		  if ( EntryOpenText ( nextentry) ) dataend= nextentry->fip[0]; 
    	EntryClose (&nextentry);
			}
			
		/* now write it all out -- use binary output to file for compactness? */			
		fprintf( file, "%d\t%ld\t%ld\t%ld\n", fileid, docstart, datastart, dataend);

skipListFSE: 
		 /* save to screen out dups */
		/* strncpy(lastename, EntryGetFullName(entry), sizeof(lastename)); */
		lastfileid= fileid;
		lastdocstart= docstart;
		}

  if ( entryCurrN == entryN - 1 ) {
		if (file) { fclose (file); file = NULL; }  
		if (fofn) { fclose(fofn); fofn= NULL; } /* dgg */
		if (hashFofn) { free(hashFofn); hashFofn= NULL; }
		}
}	

	/*--------end ListFSE----------- */	


Odd file additions ----
diff -bwrc /net/fly/b4/srs/odd/arglist.sdl odd/arglist.sdl
*** /net/fly/b4/srs/odd/arglist.sdl     Tue Sep  5 11:23:44 1995
--- odd/arglist.sdl     Sat Sep  9 10:52:21 1995
***************
*** 68,74 ****
      #arg /name="-t"    /parameter="printText"
      #arg /name="-d"    /parameter="printData"
      #arg /name="-fse"  /parameter="listFSE"     ! dgg added
!               #arg /name="-tdir" /parameter="tempDirName" ! dgg
      #arg /name="-f"    /parameter="fieldList"
      #arg /name="-l"    /parameter="libList" 
      #arg /name="-w"    /parameter="makeWild"
--- 68,74 ----
      #arg /name="-t"    /parameter="printText"
      #arg /name="-d"    /parameter="printData"
      #arg /name="-fse"  /parameter="listFSE"     ! dgg added
!     #arg /name="-tdir"  /parameter="tempDirName"     ! dgg added
      #arg /name="-f"    /parameter="fieldList"
      #arg /name="-l"    /parameter="libList" 
      #arg /name="-w"    /parameter="makeWild"

--
-- d.gilbert--biocomputing--indiana u--bloomington--gilbertd at bio.indiana.edu




More information about the Bio-srs mailing list

Send comments to us at biosci-help [At] net.bio.net