question about SRS5.0
Tao Jiang
jiangt at pku.edu.cn
Wed Dec 11 20:54:42 EST 1996
I am not sure if I am missing something here,
but I always have problems with SRS5.0.
I want to test with only phg.dat of EMBL,
which is a very small data file. With SRS4.08,
indexing takes only 1 min., while with SRS5.0
it takes so long that I cannot wait for it to finish.
I modified the files icarus.c, embl.i and embl.is just
as Thure suggested, compiled icarus.c, and ran 'srssection'
and 'srsupdate', but it still hangs (maybe it will end
after a very long time?).
I attach my embl.i and embl.is at the end; any suggestions
are greatly appreciated.
--
Tao Jiang Fax: 86-10-62751982
Network Center Homepage: http://jiangt.pku.edu.cn/~jiangt/
Peking University Email: jiangt at pku.edu.cn
-------------- next part --------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# $RCSfile: embl.i,v $
# $Revision: 1.11 $
# $Date: 1996/12/06 22:17:37 $
# $Author: etzold $
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#EMBLNEW_DB:$library:[EMBLNEW group:@SEQUENCE_LIBS
# format:@EMBL_FORMAT cachesize:2048 maxNameLen:15
# subentries:@EmblnewFeatures_DB
# files:$file:emnew
#]
# EMBL_DB: library definition for the EMBL nucleotide sequence database.
# It is placed in group @SEQUENCE_LIBS, parsed with @EMBL_FORMAT, and
# declares @EmblFeatures_DB as its subentry library.
# The files list below has a single active entry, $file:em_ph; every other
# entry is commented out, so only that one file is indexed.
# NOTE(review): the $file objects (em_ph etc.) are defined outside this
# file - confirm that em_ph resolves to the intended phg data file.
# NOTE(review): units of partSize and cachesize are not visible here.
EMBL_DB:$library:[EMBL group:@SEQUENCE_LIBS
partSize:1000000
comment:" Nucleotide Sequence Database from EMBL"
subentries:@EmblFeatures_DB
format:@EMBL_FORMAT
cachesize:2048
maxNameLen:30 files:{
# $file:est1
# $file:est2
# $file:est3
# $file:est4
# $file:est5
# $file:est6
# $file:est7
# $file:fun
# $file:gss
# $file:hum1
# $file:hum2
# $file:inv
# $file:mam
# $file:org
# $file:patent
# $file:phg
# $file:pln
# $file:pro
# $file:rod
# $file:sts
# $file:syn
# $file:unc
# $file:vrl
# $file:vrt
# $file:em_est1
# $file:em_est2
# $file:em_est3
# $file:em_est4
# $file:em_est5
# $file:em_est6
# $file:em_est7
# $file:em_ba
# $file:em_fun
# $file:em_gss
# $file:em_hum1
# $file:em_hum2
# $file:em_in
# $file:em_om
# $file:em_or
# $file:em_ov
# $file:em_pat
$file:em_ph
# $file:em_pl
# $file:em_pr
# $file:em_ro
# $file:em_sts
# $file:em_sy
# $file:em_un
# $file:em_vi
}
]
# EMBL_FORMAT: library format used by EMBL_DB.
# It binds the parsing rules in @EMBL_SYNTAX to a list of data fields.
# Each $field names a field object (DF_...), the token code it is read from
# (code:), the index type (index: id, str or int), the rule token that feeds
# the index (indexToken:) and, where given, the token and alignment used
# for table output (tableToken:, tableFormat:).
# The indexToken and tableToken names correspond to token names produced by
# rules in embl.is (for example acc, div, mol, des, key, org, authors, date,
# seqLen, ftKey, ftQual, pid, ftDes, ftSrc, chrsNo, map, t_des).
# Fields after the Feature Table Fields header carry indexId:@SUBENTRY_ID.
# NOTE(review): presumably this makes them index per feature subentry
# rather than per parent entry - confirm against the SRS documentation.
EMBL_FORMAT:$libformat:[syntax:@EMBL_SYNTAX
fileType:{@DAT_FILE @SEQ_FILE} #orig format
# fileType:{@GCGREF_FILE @GCGSEQ_FILE} #GCG format
fields:{
$field:[@DF_ID code:id index:id indexToken:id]
$field:@DF_ALL
$field:[@DF_Accession code:acc index:str indexToken:acc
tableToken:acc tableFormat:left]
$field:[@DF_Division code:id index:str indexToken:div
tableToken:div tableFormat:left]
$field:[@DF_Molecule code:id index:str indexToken:mol
tableToken:mol tableFormat:left]
# $field:[@DF_DBOrigin code:acc index:str indexToken:dbOri
# tableToken:dbOri tableFormat:left]
# $field:[@DF_AccessionKey code:acc index:str indexToken:accKey
# tableToken:accKey tableFormat:center]
$field:[@DF_Description code:des index:str indexToken:des
tableToken:t_des tableFormat:left]
$field:[@DF_Keywords code:key index:str indexToken:key
tableToken:key tableFormat:left]
$field:[@DF_Organism code:org index:str indexToken:org]
$field:[@DF_Authors code:ra index:str indexToken:authors
tableToken:authors tableFormat:left]
$field:[@DF_Date code:date index:int indexToken:date
tableToken:date tableFormat:center]
$field:[@DF_SeqLength code:sq index:int indexToken:seqLen
tableToken:seqLen tableFormat:right]
$field:[@DF_LINK code:dr]
# $field:[@DF_DNASequence token:gcgseq format:embl] #GCG format
$field:[@DF_DNASequence token:sequence format:embl] #orig format
$field:[@DF_HeaderField name:'Feature Table Fields']
$field:[@DF_FtKey code:ft index:str indexToken:ftKey
indexId:@SUBENTRY_ID tableToken:ftKey tableFormat:left]
$field:[@DF_FtQualifier code:ft index:str indexToken:ftQual
indexId:@SUBENTRY_ID]
$field:[@DF_PID code:ft index:str indexToken:pid
indexId:@SUBENTRY_ID]
$field:[@DF_FtDescription code:ft index:str indexToken:ftDes
indexId:@SUBENTRY_ID]
$field:[@DF_FtSource code:ft index:str indexToken:ftSrc
indexId:@SUBENTRY_ID]
$field:[@DF_ChrsNo code:ft index:str indexToken:chrsNo
indexId:@SUBENTRY_ID]
$field:[@DF_FtMap code:ft index:str indexToken:map
indexId:@SUBENTRY_ID]
}
]
# Syntax objects.
# EMBL_SYNTAX points at the rule file embl.is and ignores blanks and tabs.
# The second, unlabelled syntax is named ftseq, points at ftseq.is and
# additionally ignores newlines.
# NOTE(review): SRSDB: is presumably a directory alias resolved by SRS -
# confirm that both .is files exist at that location.
EMBL_SYNTAX:$syntax:[file:"SRSDB:embl.is" ignore:" \t"]
$syntax:[name:ftseq file:"SRSDB:ftseq.is" ignore:" \t\n"]
#$link:[@EMBL_DB to:@?GENBANK_DB toField:@DF_ACCNO]
#$link:[@EMBL_DB to:@?PIR_DB toField:@DF_ACCNO]
#$link:[@EMBL_DB to:@?REBASE_DB toField:@DF_ID]
#$link:[@EMBL_DB to:@?OMIM_DB toField:@DF_ID]
#$link:[@EMBL_DB to:@?MEDLINE_DB toField:@DF_ID]
# Subentry (feature) libraries. Both share @EmblFeature_Format.
# EmblFeatures_DB is referenced by the subentries attribute of EMBL_DB;
# EmblnewFeatures_DB is referenced only by the commented-out EMBLNEW_DB.
EmblFeatures_DB:$library:[EMBL_features format:@EmblFeature_Format]
EmblnewFeatures_DB:$library:[EMBLNEW_features format:@EmblFeature_Format]
# EmblFeature_Format: format of the feature subentry libraries.
# It reuses @EMBL_SYNTAX, so all tokens named here come from embl.is.
# Most fields take their content from token ft and their table output from
# a name of the form t_ft|code.
# NOTE(review): t_ft|code presumably selects, from the output of rule t_ft,
# the tokens written with that code (key, qual, or a qualifier name) -
# confirm against the SRS documentation.
# DF_ID uses token ftId and DF_DNALocation uses token ftLocat.
EmblFeature_Format:$libformat:[syntax:@EMBL_SYNTAX #idType:@SUBENTRY_ID
tableFormat:left
fields:{
$field:[@DF_ID token:ftId]
# $field:[@DF_Accession code:acc fromParent:y]
# $field:[@DF_Description code:des fromParent:y]
$field:[@DF_FtKey token:ft tableToken:'t_ft|key']
$field:[@DF_FtQualifier token:ft tableToken:'t_ft|qual']
$field:[@DF_PID token:ft tableToken:'t_ft|db_xref']
$field:[@DF_FtDescription token:ft tableToken:'t_ft|note']
$field:[@DF_FtGene token:ft tableToken:'t_ft|gene']
$field:[@DF_FtProduct token:ft tableToken:'t_ft|product']
$field:[@DF_FtPartial token:ft tableToken:'t_ft|partial']
$field:[@DF_FtPseudo token:ft tableToken:'t_ft|pseudo']
$field:[@DF_FtNumber token:ft tableToken:'t_ft|number']
$field:[@DF_FtSource token:ft tableToken:ftSrc]
$field:[@DF_ChrsNo token:ft tableToken:'t_ft|chromosome']
$field:[@DF_FtMap token:ft tableToken:'t_ft|map']
$field:[@DF_DNALocation token:ftLocat tableFormat:listing]
}
]
# Field objects local to this file, each with a display name and a
# three-letter short name. They are referenced from EmblFeature_Format.
# The other DF_ objects used above are not defined in this file.
DF_FtGene:$srsfield:[Gene short:gen]
DF_FtProduct:$srsfield:[Product short:pro]
DF_FtPartial:$srsfield:[Partial short:par]
DF_FtPseudo:$srsfield:[Pseudo short:pse]
DF_FtNumber:$srsfield:[Number short:num]
-------------- next part --------------
#!/bin/env icarus
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# $RCSfile: embl.is,v $
# $Revision: 1.10 $
# $Date: 1996/12/06 22:17:38 $
# $Author: etzold $
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Production rules for parsing EMBL flat-file entries.
# Layout of this rule set:
#   entry, fields   - split the data file into entries and an entry into
#                     tokens, one token code per line type
#   gcgseq, sequence - read the sequence part from the seq stream
#   i_*, des, org, key, ft* ... - tokens used for indexing
#   ftClean .. ftLocat - tokens used for displaying features
#   t_*             - tokens used for table output
#   h_*             - tokens used for HTML output (hyperlink replacement)
#   word .. dbName  - small shared helper rules
# Token names written here are the names referenced by indexToken,
# tableToken and token attributes in embl.i.
$rules={
# entry: skips lines that do not start with the ID prefix, then collects the
# ID line and the lines that follow it into one entry.
entry: ~ {$In:[file:text] $Out pre $Skip:0}
('ID ' {$Not} ln)*
('ID '{$entryFip=$Fip $Wrt} ln {$App}
(/( |ID|>>)/ {$Not} ln {$App})+ )? ~
# data fields
# fields: takes one entry as input and applies the line-type rules below in
# the order in which the line types are expected to appear.
fields: ~ {$In:entry $Out $Skip:1}
id sep acc sep nid sep date sep f_des sep f_key sep
(src tax? sep f_org? sep)+ (rn rc? rp? rx? ra rt? rl? sep)*
link? sep (cmnt sep)* (fh f_ft*)? sep sq
~
# Line-type rules: each writes a token with the given code covering the
# consecutive lines that start with the given two-letter line code.
sep: ~ {$Wrt:sep} ('XX' ln)* ~
id: ~ {$Wrt:id} 'ID' / *([A-Z0-9_]+)/
{$entryName=$1} ln ~
acc: ~ {$Wrt:acc} ('AC' ln)+ ~
nid: ~ {$Wrt:nid} ('NI' ln)* ~
date: ~ {$Wrt:date} ('DT' ln)+ ~
f_des: ~ {$Wrt:des} ('DE' ln)+ ~
src: ~ {$Wrt:org} ('OS' ln)+ ~
tax: ~ {$Wrt:org} ('OC' ln)+ ~
f_org: ~ {$Wrt:org} ('OG' ln)+ ~
rn: ~ {$Wrt:rn} ('RN' ln)+ ~
rc: ~ {$Wrt:rc} ('RC' ln)+ ~
rp: ~ {$Wrt:rp} ('RP' ln)+ ~
rx: ~ {$Wrt:rx} ('RX' ln)+ ~
ra: ~ {$Wrt:ra} ('RA' ln)+ ~
rt: ~ {$Wrt:rt} ('RT' ln)+ ~
rl: ~ {$Wrt:rl} ('RL' ln)+ ~
cmnt: ~ {$Wrt:cmnt} ('CC' ln)+ ~
link: ~ {$Wrt:link} ('DR' ln)+ ~
f_key: ~ {$Wrt:key} ('KW' ln)* ~
fh: ~ {$Wrt:fh} ('FH' ln)+ ~
f_ft: ~ {$Wrt:ft} 'FT' ln ('FT ' ln)* ~
sq: ~ {$Wrt:sq} 'SQ' ln ~
# parsing the sequence part from separate stream
gcgseq: ~ { $In:[file:seq] $Out pre $s=$SeqNew
$SeqMake:$s $Wrt:[s:$entryName]
}
'>>>>' {$seqFip=$Fip} (/[A-Z0-9]+/ seq |
/[A-Z0-9]+_0/ seq (/>>>>[A-Z0-9]+_[1-9]+/
{$SeqTrunc:[$s len:10000]} seq)+) ~
seq: ~ (/.*2BIT *Len:/ /[0-9]+/ {$len=$Ct} ln ln
{$SeqGet2Bit:[$s file:$File len:$len]} ('>>>>'{$Not} ln)*|
/.*ASCII/ ln ln ('>>>>' {$Not} /.*/ {$SeqApp:[$s s:$Ct]})+) ~
sequence: ~ {$In:[file:seq share:text] $Out pre {$s=$SeqNew $seqFip=$Fip}
$Wrt:[s:$entryName] $SeqMake:$s}
('ID' {$Not} ln {$SeqApp:[$s s:$Ct]})* ~
# for indexing
# Each rule below reads one token code from fields (c:) and writes the
# index tokens named by the matching indexToken attribute in embl.i.
i_id: ~ {$In:[entry] $Out:id} /ID +/ name {$Wrt} ~
i_div: ~ {$In:[fields c:id] $Out:div}
/ID +/ name /[^;]+/ ';' /[^;]+/ ';' name {$Wrt} ~
i_mol: ~ {$In:[fields c:id] $Out:mol}
/ID +/ name /[^;]+/ ';' /[^;]+/ {$Wrt} ~
i_acc: ~ {$In:[fields c:acc] $Out:acc} ('AC' (name {$Wrt} ';')+)+ ~
i_nid: ~ {$In:[fields c:nid] $Out:nid} 'NI' name {$Wrt} ~
# i_dates: writes the date on the line containing Cre as the number
# day + month*100 + year*10000, using the month lookup table.
i_dates: ~ { $In:[fields c:date] $Out:date
init $month={JAN:1 FEB:2 MAR:3 APR:4 MAY:5 JUN:6 JUL:7 AUG:8
SEP:9 OCT:10 NOV:11 DEC:12}
}
/.* ([0-9]+)-([A-Z]+)-([0-9]+)[^\n]+Cre/
{$Wrt:[credate s:($1 + $month.$2*100 + $3*10000)]} ~
des: ~ {$In:[fields c:des] $Out}
('DE' (/\\(EC *([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)/
{$Wrt:[s:$1]}| word{$Wrt} | sp)+ ln)+ ~
org: ~ {$In:[fields c:org] $Out}
('OC' (/[^;.\n]+/ {$Wrt} | /[^\n]/)*)+ |
'OS' /[^(\n]+/ {$Wrt:[s:$Trim:$Ct]}
('(' (/[^ \n)]+/ | /[^)]/)+)? |
'OG' /[^\n;.]+/ {$Wrt} ~
key: ~ {$In:[fields c:key] $Out}
('KW' (/[^\n;.]+/ {$Wrt} | /[^\n]/)+)+ ~
i_authors: ~ {$In:[fields c:ra] $Out:authors}
(/RA/ (/([^.,\n]+) +([^,;\n]+)/ {$Uniq:[s:"$1,$2"]} /[,;]/)*
ln)* ~
i_accKey: ~ {$In:[fields c:acc] $Out:accKey} 'AC' /[A-Z]+/ {$Wrt} ~
# i_dbOri: maps the letter prefix of the first accession number to a
# database name through the two lookup tables set up in init.
i_dbOri: ~ {$In:[fields c:acc] $Out:dbOri
init { $dbOriN={
A:1 F:1 V:1 X:1 Y:1 Z:1
AA:2 AC:2 AD:2 B:2 G:2 H:2 I:2 J:2 K:2
L:2 M:2 N:2 R:2 S:2 T:2 U:2 W:2
AB:3 C:3 D:3 E:3}
$dbName={1:EMBL 2:GenBank 3:DDBJ}
}
}
'AC' /[A-Z]+/ {$Wrt:[s:$dbName.$dbOriN.$Ct]} ~
i_seqLen: ~ {$In:[fields c:sq] $Out:seqLen}
'SQ Sequence' /[0-9]+/ {$Wrt} ~
# indexing features
# These rules read the ft tokens one feature at a time; the running feature
# number is kept in $ftN and written with each token (n:$ftN).
ftWord: ~ /[^" ,;:()\n.-]+/ ~ #"
ftSep: ~ /[ ,;.:()-]+/ ~
ftKey: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
'FT' /[^ ]+/ {$Wrt:[n:$ftN]} ~
ftQual: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
/[^\/]+/
(/\/([a-zA-Z0-9_]+)/ {$Wrt:[s:$1 n:$ftN]} (/=[a-zA-Z0-9_]+/ |
/="[^"]+"/)? /[^\/]+/)* ~ #"
chrsNo: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
/.+\/chromosome="?/
(/[^\n\" ]+/ {$Uniq:[n:$ftN n:0]} | '\nFT' | ' ')+ ~ #"
ftSrc: ~ {$In:[fields c:ft count:ft var:$ftN] $Out}
(/[^\/]+\/(tissue_type|cell_line|organism|strain|dev_stage|sex|clone_lib)="/
('\nFT' | ftWord {$Uniq:[n:$ftN]} | ftSep)+ | '/')* ~
ftDes: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
(/[^\/]+\/(product|note|gene)="/
('\nFT' | /NCBI gi: *[0-9]+/ |
ftWord {$Uniq:[n:$ftN]} | ftSep)+ | '/')* ~
map: ~ {$In:[fields c:ft count:ft var:$ftN] $Out}
/.+\/map="/ (/[^a-zA-Z0-9"]+/|/[a-zA-Z0-9]+/{$Wrt:[n:$ftN]})+ #"
~
pid: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
(/db_xref="?PID:([0-9a-zA-Z]+)/ {$Wrt:[n:$ftN s:$1]} |
/[^\/]+\// )+ ~
# displaying features
ftClean: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out }
'FT' ln {$Wrt} ('FT' ln {$App})* ~
ftLoc: ~ {$In:ftClean $Out} /[^ ]+/ /[^\/]+/ {$Wrt} ~
ft: ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
/.*/ {$Wrt} ~
# ftId: builds the subentry ID line from the parent entry name and the
# subentry number.
ftId: ~ {$In:[fields c:id] $Out}
/.. *([A-Z0-9]+)/ {$Wrt:[s:"ID $1\_$subEntryN; parent: $1"]}~
h_ftId: ~ {$In:[ftId t:html]} /.*parent: */
/[A-Z0-9]+/ {$Rep:{$ParStr:emblIdR $Ct $Ct}} ~
# t_ft: writes the feature key with code key, then one token per qualifier
# with the qualifier name as code and its value as content (see t_qual and
# qualval); qualifier names are also collected under code qual.
t_ft: ~ {$In:ftClean $Out} /(<A HREF)?[^ ]+/ {$Wrt:key}
/[^\/]+/ t_qual* ~
t_qual: ~ /[ \n]*\/([^ \n=]+)/ {$qn=$1 $Uniq:[qual s:$1]}
qualval? {$Wrt:[$qn s:$qv]} ~
qualval: ~ {pre $qv=""} /=([a-zA-Z0-9_]+)/ {$qv=$1} |
/="([^"]+)"/ {$qv=$1} | x{$qv='+'} ~
# extracting feature sequences"
ftCleanIter: ~ {$In:[fields c:ft] $Out }
'FT' ln {$Wrt} ('FT' ln {$App})* ~
ftLocat: ~ {$In:ftClean $Out init {$SdbFunctions}}
/(<A HREF)?[^ ]+/ /[^\/]+/ {$Wrt} ~
# printing tables
# t_authors takes the ra token, which is the code the ra rule in fields
# writes for the RA lines and the code i_authors reads as well.
t_authors: ~ {$In:[fields c:ra] $Out} 'RA' ln {$Wrt} ('RA' ln {$App})* ~
t_des: ~ {$In:[fields c:des] $Out} 'DE' ln {$Wrt} ('DE' ln {$App})* ~
# printing in HTML format
# The h_ rules replace matched text with the result of a $ParStr template
# (emblFeatR, medlineR, ...); those templates are defined outside this file.
h_ft: ~ {$In:[fields c:ft t:html count:ft var:$ftN]}
'FT' /[^ ]+/ {$Rep:{$ParStr:emblFeatR $entryName $ftN $Ct}}
# /[^\/]+/
# (/\/([a-zA-Z0-9_]+)/ {$Rep:{$ParStr:emblFtQualR $1 $Ct}}
# (/=[a-zA-Z0-9_]+/ | /="[^"]+"/)? /[^\/]+/)*
~
h_id: ~ {$In:[fields c:id t:html]} /../ name {$entryName=$Ct} ~
h_rx: ~ {$In:[fields c:rx t:html]}
/../ 'MEDLINE;' /[0-9]+/ {$Rep:{$ParStr:medlineR $Ct $Ct}} ~
htmlTag: ~ /<[^<>]*>/ ~
# h_links: for each DR line, looks up the database name in the table below
# to choose the template used to rewrite the accession number.
h_links: ~ {$In:[fields c:link t:html]
init{$hl={
'SWISS-PROT': swissR
DICTYDB: dictydbR GCRDB: gcrdbR
MAIZEDB: maizedbR WORMPEP: wormpepR
LISTA: listaR
PIR: pirR YEPD: yepdR
SGD: sgdR
STYGENE: stygeneR
HIV: hivR
ECOGENE: ecogeneR ECO2DBASE: eco2dbaseR
MIM: mimR SUBTILIST: subtilistR
FLYBASE: flybaseR
TRANSFAC: transfacR REBASE: rebaseR
}
}
}
(/DR/ dbName {$db=$Ct} ';'
accno {$Rep:{$ParStr:$hl.$db $Ct $Ct}} ln)*
~
# other stuff
# Small lexical helpers shared by the rules above.
word: ~ /[^" ,;:()\/=\n.-]+/ ~
sp: ~ /[ ,;.:()\/=-]+/ ~
lnCode: ~ /\n[A-Z][A-Z]/ ~
ln: ~ /[^\n]*\n/ ~
accno: ~ /[A-Z0-9]+/ ~
num: ~ /(<|>)?[0-9?]+/ ~
name: ~ /[a-zA-Z0-9_-]+/ ~
dbName: ~ /[^;]+/ ~
}
# Test driver, executed only when $TestMode is set.
# It creates a job over $rules for one hard-coded data file and, for every
# input entry, requests the acc tokens and then the t_ft tokens before
# moving on to the next entry.
# NOTE(review): print:0 and print:1 presumably switch token printing off
# and on - confirm. The fileName path is site specific and must exist for
# the test to run.
if:$TestMode {
$job = $JobNew:[prod:$rules skip:" " fileName:'/data/embl_dna/mam.dat']
while:$JobHasInput:$job {
$JobTokens:[$job name:acc print:0]
$JobTokens:[$job name:t_ft print:1]
$JobNext:$job
#$print:"-------->entry\n"
}
}
More information about the Bio-srs
mailing list
Send comments to us at biosci-help [At] net.bio.net