In article <58kjn3$5to at usenet81.supernews.com>, duvickj at phibred.com (Jon Duvick) writes:
>Are there any programs out there that will parse FASTA files
>(specifically, text files containing multiple sequences in FASTA
>format) into common database formats such as tab-delimited?
Well, I don't have a general solution. Locally we use a little Fortran
program called DELIMIT to insert the tabs, commas, or what have you. (I
prefer commas - tab delimited text is miserable to try and type, view, or
edit.) Problem is, FASTA files can look like either:
>name
80 chars
80 chars more
80 chars more
40 chars -> total 280
or
>name
280 chars, total
The latter form can be really messy to deal with. Assuming that you have
just the first form, DELIMIT will work. (It uses some VAX F77 extensions,
but I've not seen a compiler lately that doesn't have them.)
You can also do this sort of formatting with most text editors, just
do global search and replace on "A"-> "A<tab>", "G"->"G<tab>" etc.
Regards,
David Mathog
mathog at seqaxp.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech
**********************************************************************
c DELIMIT.FOR
C 3-MAR-1992 David Mathog, Biology Division, Caltech
C
C Reads a formatted text file line by line and writes a copy in
C which a delimiter character (usually a comma or a tab) is put
C in front of every character that follows the first N "skipped"
C characters of the line.  No delimiter is written after the last
C character of a line.  With skip = 0 every output line therefore
C starts with the delimiter (see the help text printed below).
C
C Uses VAX FORTRAN extensions: the Q edit descriptor (returns the
C number of characters remaining in the input record),
C CARRIAGECONTROL= and READONLY on OPEN, DO WHILE / END DO and
C IMPLICIT NONE.
C
C Limits and error handling:
C   - input lines longer than MAXLEN characters are truncated to
C     MAXLEN; the number of truncated lines is reported at the end
C   - the output buffer holds 2*MAXLEN characters, the worst case
C     (skip = 0, every character preceded by a delimiter)
C   - a line shorter than the skip count is copied unchanged
C   - a read error stops the program with an error message instead
C     of reporting normal completion
C *****************************************************
c
      implicit none
c
c maxlen = longest input line handled, outmax = longest output line
c
      integer maxlen,outmax
      parameter (maxlen=1000,outmax=2*maxlen)
      character*80 infile,outfile
      character*(maxlen) templine
      character*(outmax) outline
      character*1 delim
      integer inlen,namlen,itrim,nskip,ntrunc,ios,i,ipos
c
c
      write(6,*)'DELIMIT'
      write(6,*)
     1 'This program performs the following operations on each line'
      write(6,*)' 1. Replaces "char" by "char delimiting-char"'
      write(6,*)' 2. Optionally, first N spaces can be NOT delimited'
      write(6,*)' '
      write(6,*)' Example, delimiter = ",", skip first 10 spaces'
      write(6,*)' '
      write(6,*)' 1234567890abcdefghijklmnop'
      write(6,*)' becomes'
      write(6,*)' 1234567890,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p'
      write(6,*)' '
      write(6,*)' ( Note that if you specify skip = 0 then'
      write(6,*)' '
      write(6,*)' 1234567890abcdefghijklmnop'
      write(6,*)' becomes'
      write(6,*)' ,1,2,3,4,5,6,7,8,9,0,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p'
      write(6,*)' )'
      write(6,*)' '
      write(6,*)' '
c
c Input file.  The whole variable is read (blank padded) and the
c Q count is clamped, so an over-long answer cannot index past the
c end of the name buffer.
c
      write(6,*)'Input the name of the input file'
      read(5,'(q,a)')namlen,infile
      namlen=min(namlen,len(infile))
      if(namlen.lt.1)stop 'DELIMIT: no input file name given'
      open(unit=10,file=infile(1:namlen),form='FORMATTED',
     1 carriagecontrol='LIST',status='OLD',READONLY,iostat=ios)
      if(ios.ne.0)stop 'DELIMIT: cannot open the input file'
c
c Output file.  It must not exist yet (status='NEW'); the record
c length is sized for the longest possible delimited line.
c
      write(6,*)'Input the name of the output file'
      read(5,'(q,a)')namlen,outfile
      namlen=min(namlen,len(outfile))
      if(namlen.lt.1)stop 'DELIMIT: no output file name given'
      open(unit=11,file=outfile(1:namlen),form='FORMATTED',
     1 carriagecontrol='LIST',status='NEW',recl=outmax,iostat=ios)
      if(ios.ne.0)stop 'DELIMIT: cannot create the output file'
c
c Delimiter: first character typed, or a comma on an empty answer
c
      write(6,*)' '
      write(6,*)'Which delimiting character should be used'
      write(6,*)'<RETURN> for comma (default)'
      read(5,'(q,a)')inlen,templine
      if(inlen.le.0)then
         delim=','
      else
         delim=templine(1:1)
      end if
      write(6,*)'The delimiter character will be >',delim,
     1 '<, ascii value=',ichar(delim)
c
      write(6,*)' '
      write(6,*)'How many characters at the beginning of the line'
      write(6,*)'are NOT to be delimited? ( .ge. 0)'
      read(5,*)itrim
      if(itrim.lt.0)stop 'values must be >=0'
      write(6,*)' '
c
c Loop through the file and do it.  ios stays 0 while records are
c read successfully, goes negative at end of file and positive on
c a read error.
c
      ntrunc=0
      ios=0
      do while (ios.eq.0)
         read(10,'(q,a)',iostat=ios)inlen,templine
         if(ios.eq.0)then
c
c           Q reports the true record length; only maxlen characters
c           fit in templine, so longer records are truncated
c
            if(inlen.gt.maxlen)then
               ntrunc=ntrunc+1
               inlen=maxlen
            end if
c
c           never skip more characters than the line actually has
c
            nskip=min(itrim,inlen)
            if(nskip.gt.0)outline(1:nskip)=templine(1:nskip)
c
c           ipos = number of characters placed in outline so far.
c           The delimiter goes in front of each character, so none
c           is left dangling at the end of the line.
c
            ipos=nskip
            do i=nskip+1,inlen
               outline(ipos+1:ipos+1)=delim
               outline(ipos+2:ipos+2)=templine(i:i)
               ipos=ipos+2
            end do
c
c           an empty line is written as an empty record
c
            if(ipos.gt.0)then
               write(11,'(a)')outline(1:ipos)
            else
               write(11,'(a)')
            end if
         end if
      end do
c
      close(unit=10)
      close(unit=11)
      if(ios.gt.0)then
         write(6,*)'Lethal error'
         stop 'DELIMIT: read error, output file is incomplete'
      end if
      if(ntrunc.gt.0)then
         write(6,*)'Warning: input lines truncated to',maxlen,
     1    ' characters:',ntrunc
      end if
      write(6,*)'Done'
      stop 'CommaDelimit: processing completed'
      end