Thure,
I customized version 4 of SRS to produce, as a search result, a file of
byte indices into the databanks, and to feed that file into FastA (which
was also customized). This allows one to do fasta searches without
having to generate a huge temporary data file. Is that of any interest
to you or others (the srs customization)? I don't think I would want to
run an SRS-fasta search that allowed users to generate temp files in the
multi-gigabyte range, and even if I had that disk space, it would likely
be slower than generating byte indices.
-- Don
The main function I had to add to srs4 was one that lists file indices.
In the seqlib.c file, I added the SlbListFSE function below,
then added an item to getz.c to allow calling it, plus a few
configuration items (in the odd files). I haven't had a chance
to port this to srs 5 yet; however, I will do so when time permits.
see http://iubio.bio.indiana.edu/srsfasta/
/* --- ListFSE parts --------------*/
/* dgg - from hash.c, for hashing file names... */
/*
 * HashString -- hash a NUL-terminated string to a long.
 *
 * For every character the accumulator is shifted left by four bits and the
 * character is added; whenever any of bits 28..31 become set they are
 * folded back into the low bits (XOR with the nibble shifted right by 24)
 * and then cleared.  An empty string hashes to 0.
 *
 * s: pointer to the NUL-terminated string to hash (not modified).
 * Returns the accumulator converted to long.
 */
static long HashString(void *s)
{
    const char *cursor = (const char *) s;
    unsigned long acc = 0;

    for (; *cursor != '\0'; cursor++) {
        unsigned long top;

        acc = (acc << 4) + *cursor;
        top = acc & 0xf0000000;
        if (top != 0)
            acc = (acc ^ (top >> 24)) ^ top;   /* fold, then clear, the top nibble */
    }
    return (long) acc;
}
/* --- state kept between successive SlbListFSE() calls --- */

/* Document start offset of the entry handled by the previous call. */
static long lastdocstart = -1;

/* File id of the entry handled by the previous call. */
static short lastfileid = -1;

/* Number of file-name hashes currently recorded in hashFofn. */
static long nFofn = 0;

/* Number of slots currently allocated for hashFofn. */
static long maxFofn = 0;

/* Hash of each file name in the file-of-file-names; the index is the file id. */
static long *hashFofn = NULL;

/* Suffix appended to the "listFSE" parameter to name the entry-index file. */
char *gOutentrySuffix = ".indices";

/* Suffix appended to the "listFSE" parameter to name the file-of-file-names. */
char *gOutnamesSuffix = ".names";
/*
 * SlbFseAppendHash -- store `hash` in slot `used` of a file-name hash table,
 * growing the table by 20 slots when it is full.
 *
 * Returns the (possibly moved) table.  Returns NULL when the table is not
 * available: either it was already NULL, or growing it failed, in which
 * case the old block is freed here so that nothing leaks.  A table that has
 * been lost is never re-created, because its earlier slots would be
 * uninitialised while the caller's count still covers them.
 */
static long *SlbFseAppendHash(long *table, long used, long *capacity, long hash)
{
  if (table == NULL)
    return NULL;
  if (used >= *capacity) {
    long grown = used + 20;
    long *bigger = (long *) realloc(table, (size_t) grown * sizeof(long));

    if (bigger == NULL) {
      free(table);
      return NULL;
    }
    table = bigger;
    *capacity = grown;
  }
  table[used] = hash;
  return table;
}

/*
 * SlbFseConcat3 -- write the concatenation a+b+c into dst (dstSize bytes).
 * Returns 1 on success, 0 when any piece is NULL or the result would not
 * fit (dst may then hold truncated text and must not be used).
 */
static int SlbFseConcat3(char *dst, size_t dstSize,
                         const char *a, const char *b, const char *c)
{
  int n;

  if (!a || !b || !c)
    return 0;
  n = snprintf(dst, dstSize, "%s%s%s", a, b, c);
  return n >= 0 && (size_t) n < dstSize;
}

/*
 * SlbListFSE -- dgg addition: list the position of an entry's sequence
 * in its databank file.
 *
 * For the given entry one line
 *     fileid <TAB> docstart <TAB> datastart <TAB> dataend
 * is appended to the entry-index file, whose name is built from the
 * "tempDirName" parameter, the "listFSE" parameter and gOutentrySuffix.
 * The first time a data file name is seen, one line
 *     filename <TAB> seqformat <TAB> fileid <TAB> libname
 * is appended to the file-of-file-names (same name, gOutnamesSuffix); the
 * fileid is the zero-based line number of that file name in the names file.
 *
 * entry       entry to list; nothing is listed if EntryOpenData() fails.
 * setName     not used by this function.
 * entryN      number of entries in the set being listed.
 * entryCurrN  zero-based index of this entry.  Index 0 (re)opens the output
 *             files; index entryN-1 closes them and frees the hash table.
 *
 * dataend is the start offset of the next entry in the library, or 0 when
 * that entry cannot be opened (end not known).
 *
 * NOTE(review): file names are matched by hash value only, so two names
 * with the same HashString() value would share a file id.
 */
void SlbListFSE(ENTRYo *entry, char *setName, int entryN, int entryCurrN)
{
  if ( EntryOpenData (entry)) {
    short seqformat, fileid;
    long docstart, datastart, dataend;
    char * filename;
    IDoENTRY nextid;
    ENTRYo *nextentry;

    filename= entry->file[1]->nam;
    fileid= 0;    /* replaced below by the name's line number in the names file */
    docstart= entry->fip[0];

    /* !! NOTE: for GENBANK (? and others) entry->fip[1]
       currently points at the "ORIGIN" line above data.
       In SwissProt,EMBL it points to "SQ SEQUENCE 544 AA..." above data
       In PIR, it points to 'SEQUENCE' line above data.
       That line must be bypassed, so the current read position is used
       instead: this is after the 1st line read of EntryOpenData().
    */
    datastart= ftell(entry->file[1]->fil);
    dataend= 0;   /* flag we don't know end */

    /* we want real file format, in fasta/pearson lib file values */
    seqformat= entry->lib->form->fil_t[1]->type - SLBxPEARSON;

    if (!entryCurrN || !file) {
      /* force output to files -- file of entry indices & file of filenames */
      char *tmp, *outentry;
      char outDirName[FILxXNAM+1], outFile[FILxXNAM+1];
      int dirOk;

      /* -1 = "no previous entry".  Using 0 here would make the duplicate
         filter below drop an entry that starts at offset 0 of file id 0. */
      lastdocstart= -1; lastfileid= -1;
      nFofn = 0;
      maxFofn= 20;
      free(hashFofn);
      hashFofn = (long*) malloc( maxFofn * sizeof(long));

      /* Copy the directory before asking for the next parameter. */
      tmp = ParGetStr ("tempDirName");
      dirOk = SlbFseConcat3 (outDirName, sizeof outDirName, tmp ? tmp : "", "", "");
      outentry= ParGetStr("listFSE");

      /* Index file: append to an existing one, otherwise create it.
         A missing "listFSE" value or an over-long path leaves it closed. */
      if (dirOk && SlbFseConcat3 (outFile, sizeof outFile,
                                  outDirName, outentry, gOutentrySuffix)) {
        _FilLN(outFile);
        file = fopen( outFile, "r+");
        if (!file) file = fopen( outFile, "w");
        else fseek(file, 0, SEEK_END);
      }
      else file = NULL;

      /* Names file: only opened when the index file is open, so that a
         failing index file cannot cause the names file to be reopened on
         every call. */
      if (file && SlbFseConcat3 (outFile, sizeof outFile,
                                 outDirName, outentry, gOutnamesSuffix)) {
        _FilLN(outFile);
        fofn = fopen( outFile, "r+");
        if (!fofn)
          fofn = fopen( outFile, "w");
        else {
          /* read & store hashFofn values: one hash per line, taken from the
             text before the first tab, space or newline */
          char *ep, aline[FILxXNAM+20];

          while ( fgets( aline, sizeof(aline), fofn) ) {
            int sawEol = 0;

            for (ep= aline; *ep; ep++)
              if (*ep == '\n') { sawEol = 1; break; }
            if (!sawEol) {
              /* line longer than the buffer: discard the rest so it is
                 still counted as ONE line (line number == file id) */
              int c;
              while ((c = getc(fofn)) != EOF && c != '\n')
                ;
            }
            ep= aline;
            while (*ep && *ep != '\t' && *ep != ' ' && *ep != '\n') ep++;
            *ep= '\0';
            hashFofn = SlbFseAppendHash (hashFofn, nFofn, &maxFofn,
                                         HashString( aline));
            nFofn++;
          }
        }
      }
      else fofn = NULL;
    }

    if (fofn) {
      /* !! save time by storing filenames in local array !? */
      long slot;
      long filehash;
      int known = 0;

      filehash= HashString( filename);
      if (hashFofn) {
        for (slot= 0; slot < nFofn; slot++) {
          if (filehash == hashFofn[slot]) {
            fileid= (short) slot;
            known= 1;
            break;
          }
        }
      }
      if (!known) {
        /* first time this data file is seen: append it to the names file */
        char * libname;

        libname= LibGetName(entry->lib,"full");
        fileid= (short) nFofn;
        fseek( fofn, 0, SEEK_END);   /* also required between reading and writing */
        fprintf( fofn,"%s\t%d\t%d\t%s\n", filename, seqformat, fileid,
                 libname ? libname : "");
        hashFofn = SlbFseAppendHash (hashFofn, nFofn, &maxFofn, filehash);
        nFofn++;
      }
    }

    /* screen out dups: skip an entry identical to the previous one */
    if (file && (fileid != lastfileid || docstart != lastdocstart)) {
      /* find data end -- shouldn't this be in entry record somewhere ! */
      IdCopy( &nextid, entry->id);
      nextid.fip++;   /* skip to next entry in library */
      nextentry = EntryOpen (&nextid);
      if (nextentry) {
        if ( EntryOpenText ( nextentry) ) dataend= nextentry->fip[0];
        EntryClose (&nextentry);
      }

      /* now write it all out -- use binary output to file for compactness? */
      fprintf( file, "%d\t%ld\t%ld\t%ld\n", fileid, docstart, datastart, dataend);
    }

    lastfileid= fileid;
    lastdocstart= docstart;
  }

  if ( entryCurrN == entryN - 1 ) {
    /* last entry of the set: release everything opened above */
    if (file) { fclose (file); file = NULL; }
    if (fofn) { fclose(fofn); fofn= NULL; }   /* dgg */
    free(hashFofn);
    hashFofn= NULL;
  }
}
/*--------end ListFSE----------- */
Odd file additions ----
diff -bwrc /net/fly/b4/srs/odd/arglist.sdl odd/arglist.sdl
*** /net/fly/b4/srs/odd/arglist.sdl Tue Sep 5 11:23:44 1995
--- odd/arglist.sdl Sat Sep 9 10:52:21 1995
***************
*** 68,74 ****
#arg /name="-t" /parameter="printText"
#arg /name="-d" /parameter="printData"
#arg /name="-fse" /parameter="listFSE" ! dgg added
! #arg /name="-tdir" /parameter="tempDirName" ! dgg
#arg /name="-f" /parameter="fieldList"
#arg /name="-l" /parameter="libList"
#arg /name="-w" /parameter="makeWild"
--- 68,74 ----
#arg /name="-t" /parameter="printText"
#arg /name="-d" /parameter="printData"
#arg /name="-fse" /parameter="listFSE" ! dgg added
! #arg /name="-tdir" /parameter="tempDirName" ! dgg added
#arg /name="-f" /parameter="fieldList"
#arg /name="-l" /parameter="libList"
#arg /name="-w" /parameter="makeWild"
--
-- d.gilbert--biocomputing--indiana u--bloomington--gilbertd at bio.indiana.edu