IUBio

FASTA style database in SRS (one solution!)

Staffan Bergh staffan at vespucci.sto.se.pnu.com
Fri Oct 31 02:34:45 EST 1997


Here's another solution. Enjoy.

/staffan

Staffan Bergh
Pharmacia & Upjohn, P14:5, SE-112 87 Stockholm, Sweden

email:  Staffan.Bergh at eu.pnu.com + Don't let that horse eat that violin
phone:  (int+46)  08 695 9884    +               cried Chagall's mother
fax:    (int+46)  08 695 4084    + but he kept right on painting
mobile: (int+46) 070 898 8829    +             -- Lawrence Ferlinghetti

### fastafile.is ###

#!/bin/env icarus
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    $Id: fastafile.is,v 1.2 1997/10/13 13:08:55 staffan Exp $
#    $Log: fastafile.is,v $
#    Revision 1.2  1997/10/13 13:08:55  staffan
#    Check in of file from test directory
#
#    Revision 1.2  1997/10/13 13:05:57  staffan
#    Fixed a lot of things. Now works!
#
#    Revision 1.1  1997/10/06 16:03:11  staffan
#    Initial revision
#
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Parsing productions for a FASTA-style flat file.
# Each production has the form  name: ~ ... ~ . Productions that carry
# {$In:... $Out} are the tokens referred to by name from the format
# definition (code:/indexToken:/token: in fastafile.i) and from the
# test driver at the bottom of this file.
$rules={
  # entry: input is the text file, with $Skip:0 set in the pre-action.
  # First group: zero or more lines handled by ('>' {$Not} ln)*.
  # NOTE(review): presumably {$Not} inverts the '>' match so that lines NOT
  # starting with '>' are consumed here -- confirm against the Icarus manual.
  # Second (optional) group: one '>' header line; stores $Fip in $entryFip,
  # writes the name and keeps it in $entryName, then appends the rest of the
  # line via appLn.
  entry:    ~ {$In:[file:text] $Out pre{$Skip:0}}
	      ('>' {$Not} ln)*
              ('>' {$entryFip=$Fip} name {$Wrt $entryName=$1} appLn)? ~

  # data fields
  # fields: input is the entry token ($Skip:1). The name is written as [id];
  # whatever follows on the line (after optional spaces) is written as [des].
  fields:   ~ {$In:entry $Out $Skip:1} 
	       (name  {$Wrt:[id]} / *([^\n]*)/ {$Wrt:[des]}) ~

  # sequence: input is file type seq, shared with text. The pre-action creates
  # a new sequence $s named after $entryName and stores $Fip in $seqFip
  # (the variable named by fipVar: in FASTADB_SEQ). The body matches the id
  # line, the remainder of that line, and then any number of chunks containing
  # no '>' character, each handed to $SeqApp.
  sequence: ~ {$In:[file:seq share:text] $Out
		 pre {$s=$SeqNew:$entryName $seqFip=$Fip} 
		  $Wrt:[s:$entryName] $SeqMake:$s}
		(idline ln (/^([^>]+)/ {$SeqApp:[$s s:$Ct]})*) ~

  # indexing
  # id: takes the id part of fields and writes it unchanged.
  id:       ~ {$In:[fields c:id] $Out} 
		/.+/ {$Wrt} ~
  # des: takes the des part of fields and splits it into index words.
  # The alternatives are tried in the order written below.
  # NOTE(review): the stopwords are plain literals; check whether they can
  # also match the leading characters of a longer word (e.g. 'in' at the
  # start of 'insulin') and so split it -- confirm with a test run.
  des:    ~ {$In:[fields c:des] $Out}
		(/[0-9]+\.[0-9]+/ {$Uniq} | # catch floats before we see a fullstop
		  stopwords |               # skip stopwords
		 /[\.\;] +/ |               # a trailing punct. mark with spaces after is a separator
		 /[^-A-Za-z0-9\_]+/ |       # as is anything that doesn't match an alphanumeric, '-' or '_'
		 (/[-A-Za-z0-9\:\_]+/       # everything else is a word
			{ 
			  $split = $Trim:[$Ct skip:":"] # trim off a trailing ':'
			  $Uniq:[s:$split]
			}
		 )
	        )* ~

  # other  
  # Helper productions. Within this file only name, idline, appLn, ln and
  # stopwords are referenced by the productions above; lead, title, word,
  # sep, seqline and lnbreak are defined but not used here.
  lead:     ~ '>' ~
  name:     ~ /[-a-zA-Z0-9_\.]+/ ~
  title:    ~ / *[^\n]*/ ~
  idline:   ~ /^>/ name  ~
  appLn:    ~ /[^\n]*\n/ {$App} ~
  ln:       ~ /[^\n]*\n/  ~
  word:     ~ /[-A-Za-z0-9_]+/ ~
  sep:      ~ /[^-A-Za-z0-9_]+/ ~
  seqline:  ~ /^[A-Za-z]+\n/ ~
  lnbreak:  ~ /\n$/ ~
  stopwords: ~ ('in' | 'for' | 'to' | 'by' | 'of') ~
  
}

# Stand-alone test driver, executed only when $TestMode is set.
# Creates a job over $rules for one data file and, while the job has input,
# prints the entry and sequence tokens, then advances with $JobNext.
# NOTE(review): the fileName below is a hard-coded, site-specific path; it
# must be changed to a local FASTA file before running the test elsewhere.
# The commented-out $JobTokens lines print the other tokens (fields, id, des)
# and can be re-enabled when debugging those productions.
if:$TestMode {
  $job = $JobNew:[prod:$rules skip:" " fileName:'/net/honken/disk2/databases/genomes/fasta/seq/h_influenzae.seq']
  while:$JobHasInput:$job {
    $JobTokens:[$job name:entry  print:1] 
    $JobTokens:[$job name:sequence print:1]
#    $JobTokens:[$job name:fields  print:1] 
#    $JobTokens:[$job name:'fields|id' print:1]
#    $JobTokens:[$job name:'id' print:1]
#    $JobTokens:[$job name:'des' print:1]
    $JobNext:$job
#    $print:"----------------------------------------------\n"
  }
}

### fastafile.i  ###

#!/bin/env icarus
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    $Id: fastafile.i,v 1.1 1997/10/13 15:58:52 staffan Exp $
#    $Log: fastafile.i,v $
#    Revision 1.1  1997/10/13 15:58:52  staffan
#    Initial revision
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# You need to select one of these FORMATs in your .i file

# Library format for FASTA files holding nucleotide sequences
# (contains:@DNASEQ_DATA). Uses the FASTADB_SYNTAX syntax and the two file
# types FASTADB_FILE and FASTADB_SEQ defined further down in this file.
# Fields: ID (token id), Description (token des, also used as table token)
# and the DNA sequence (token sequence, format fasta); the token names match
# the productions id, des and sequence in fastafile.is.
FASTASEQS_FORMAT:$libformat:[syntax:@FASTADB_SYNTAX contains:@DNASEQ_DATA
  tableFormat:left 
  fileType:{@FASTADB_FILE @FASTADB_SEQ}
  fields:{
    $field:[@DF_ID code:id index:id indexToken:id]
    $field:[@DF_Description code:des index:str indexToken:des
	tableToken:des]
    FASTASeq:$field:[@DF_DNASequence token:sequence format:fasta]
  }
]

# Library format for FASTA files holding protein sequences
# (contains:@PROTSEQ_DATA). Identical to FASTASEQS_FORMAT except for the
# data type and the sequence field, which is @DF_ProtSequence here.
# The token names id, des and sequence match the productions of the same
# names in fastafile.is.
FASTAPEPS_FORMAT:$libformat:[syntax:@FASTADB_SYNTAX contains:@PROTSEQ_DATA
  tableFormat:left 
  fileType:{@FASTADB_FILE @FASTADB_SEQ}
  fields:{
    $field:[@DF_ID code:id index:id indexToken:id]
    $field:[@DF_Description code:des index:str indexToken:des
	tableToken:des]
    FASTAPep:$field:[@DF_ProtSequence token:sequence format:fasta]
  }
]

# Syntax object shared by both formats above: points at the production file
# fastafile.is under SRSDB: and lists space and tab in ignore:.
FASTADB_SYNTAX:$syntax:[file:"SRSDB:fastafile.is" ignore:" \t"]

# File types used by both formats above.
# FASTADB_FILE is the text view of the flat file (maxline:10000).
# FASTADB_SEQ is the sequence view; it shares the file with FASTADB_FILE,
# takes its data from the sequence token, and names seqFip as its fipVar --
# the variable set by $seqFip=$Fip in the sequence production of fastafile.is.
FASTADB_FILE:$filetype:[text maxline:10000]
FASTADB_SEQ:$filetype:[seq shareWith:@FASTADB_FILE fieldTokens:sequence 
  fipVar:seqFip]




More information about the Bio-srs mailing list

Send comments to us at biosci-help [At] net.bio.net