David Ghosh recently sent out a warning that TFD 7.0's sites.dat file would
have a slightly different format. This change will break the previous
version of TFDtoGCG that was posted to this newsgroup a few months ago.
The new version of TFDtoGCG appended to the end of this message should work
with TFD 7.0 (but not with earlier TFD releases). I say "should" because the data
set hasn't been released yet, and so the modified program has not been
tested. No problems are expected since the changes were to the record
structure only.
As usual, no warranties, etc.
David Mathog
mathog at seqvax.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech
--8<----8<----8<----8<----8<- CUT HERE ---8<----8<----8<----8<----8<--
C TFDTOGCG.FOR
C 9-APR-1993 David Mathog, Division of Biology, Caltech
C mathog at seqvax.bio.caltech.edu
C
C This little program takes one of David Ghosh's site.dat
C files and reformats it for GCG usage.
C
C Put the output file into the GCG system as: GENMOREDATA:TFSITES.DAT
C
C FTP to NCBI.NLM.NIH.GOV and look in repository/TFD/tfd.ascii
C for the site.dat file.
C
C This works with GCG 7.2 and VMS 5.5-2 and *may* work on other
C systems (not tested).
C
C Instructions for building an executable:
C
C $ for/nolis tfdtogcg
C $ link/nomap tfdtogcg
C
C Example session (program's prompts not shown):
C
C $ run tfdtogcg
C site.dat
C temporary.out
C 9-APR-1993, Converted TFD X.Y to GCG format
C
C $ copy temporary.out genmoredata:tfsites.dat
C $ set file/prot=w:re genmoredata:tfsites.dat
C $ delete temporary.out.
C
C There is practically NO error checking, so watch out!
C
C Revision 1.0 24-JUN 1993, David Mathog
C TFD 7.0 changes in sites file are:
C GENOME*1 removed
C TRN_UNIT*20 resized to 30
C Modified ghosh_record to reflect these changes.
C
c Main program: convert a TFD 7.0 sites file to GCG format.
c
c Prompts (unit 6) and reads (unit 5):
c   1. the name of the input file, opened on unit 10
c   2. the name of the output file, created on unit 11
c   3. comment lines, copied to the output until an empty line
c The output then gets a title record, a '..' divider, and one
c GCG record for every record read from the input.
c
c The record loop stops at end of file (IOSTAT < 0) and also on a
c read error (IOSTAT > 0); a read error is reported on unit 6.
c
      implicit none
      character*2048 inline,infile,outfile
      integer*4 inlen,istat
      logical ok
c
c Ghosh lays out site records like this in TFD 7.0
c
      structure /ghosh_record/
        character SITE_ID*6
        character FAC_NAME*25
        character SEQ_NAME*30
        character NA_SEQ*45
        character SEQ_TYPE*1
        character SYSTEM*10
        character TRN_UNIT*30
        character COMMENTS*80
        character MAIN_REF*60
        character FAC_SOURCE*16
        character LOCAT_REF*20
        character LOCATION*20
        character METHOD*11
        character N_PROB*8
        character REF_N*8
        character STRAND*1
        character BINDING*1
      end structure
c
c GCG lays out TFSITES.DAT records like this
c
      structure /GCG_RECORD/
        character SEQ_NAME*31
        character SPACER1*2
        character NA_SEQ*45
        character SPACER2*5
        character FAC_NAME*25
        character SPACER3*1
        character MAIN_REF*60
      end structure
c
      record /ghosh_record/ ghosh
      record /gcg_record/ gcg
c
c The spacers are the same in every output record: set them once
c
      gcg.spacer1 = '0 '
      gcg.spacer2 = ' 0 ! '
      gcg.spacer3 = ' '
c
      write(6,*)'TFDtoGCG'
      write(6,*)'This program converts one of David Ghosh''s site'
      write(6,*)' files to GCG''s format'
c
c Input file. The Q edit descriptor returns the number of
c characters in the line, which bounds the substring that A fills.
c
      write(6,*)'Input the name of the file to process'
      read(5,'(q,a)')inlen,infile(1:inlen)
c
      open(unit=10,file=infile(1:inlen)
     1 ,form='UNFORMATTED',organization='SEQUENTIAL',status='OLD'
     2 ,recordtype='VARIABLE', READONLY)
c
c Output file. status='NEW' means an existing file is not reused.
c
      write(6,*)'Input the name of the output file'
      read(5,'(q,a)')inlen,outfile(1:inlen)
c
      open(unit=11,file=outfile(1:inlen)
     1 ,form='UNFORMATTED',status='NEW',organization='SEQUENTIAL'
     2 ,recordtype='STREAM_LF',recl = 255)
c
c Copy the user's comment lines to the top of the output file,
c each one prefixed with a blank. An empty line ends the comments.
c
      write(6,*)'Enter as many lines of comments as you would like'
      write(6,*)' End each line with a <return>'
      write(6,*)' End the last line with <return><return>'
      ok = .true.
      do while (ok)
        read(5,'(q,a)')inlen,inline(1:inlen)
        if(inlen.eq.0)then
          ok = .false.
        else
          write(11)' '//inline(1:inlen)
        end if
      end do
      write(6,*)'Working ...'
c
c Title line: column headings placed in the GCG record fields
c
      GCG.SEQ_NAME = 'NAME'
      GCG.FAC_NAME = 'FACTOR'
      GCG.NA_SEQ = 'SEQUENCE'
      GCG.MAIN_REF = 'REFERENCE'
      write(11)gcg
c
c Divider between the heading and the data records
c
      write(11)'..'
c
c Convert the site records one at a time. Only a zero IOSTAT means
c that ghosh holds a freshly read record, so anything else ends the
c loop without converting.
c
      istat = 0
      do while (istat.eq.0)
        read(10,iostat=istat)ghosh
        if(istat.eq.0)then
          GCG.SEQ_NAME = GHOSH.SEQ_NAME//' '
          GCG.FAC_NAME = GHOSH.FAC_NAME
          GCG.NA_SEQ = GHOSH.NA_SEQ
          GCG.MAIN_REF = GHOSH.MAIN_REF
          call fixseqname(GCG.SEQ_NAME,GHOSH.N_PROB)
          write(11)gcg
        end if
      end do
c
c A negative IOSTAT is the normal end of file. A positive one is a
c read error: say so, but still close both files so that the records
c converted so far are kept.
c
      if(istat.gt.0)then
        write(6,*)'TFDtoGCG: error reading input, IOSTAT =',istat
      end if
      close(unit=10)
      close(unit=11)
      if(istat.gt.0)stop 'TFDtoGCG: stopped on read error'
      stop 'TFDtoGCG: normal completion'
      end
! fixseqname: rewrite a site name in place for the GCG file.
!
!   NAME    (in/out) site name; its length is not changed.
!   N_PROB  (in)     text of the N_PROB field, read as a real number
!                    with an F edit descriptor as wide as the field.
!
! What it does:
!   1. If N_PROB reads as a value greater than 5.0e-4, NAME is moved
!      one place to the right (its last character is lost) and a ';'
!      is put in front, to indicate a frequent motif.
!   2. Every blank in NAME becomes '_', then the run of '_' at the
!      end of NAME is turned back into blanks.
!   3. If NAME holds nothing except blanks and '_', it becomes
!      'UNKNOWN'.  This cannot happen once a ';' has been added.
!
! An N_PROB field that is empty or cannot be read as a number counts
! as 0.0, so no ';' is added and the conversion carries on.
!
      subroutine fixseqname(NAME,N_PROB)
      implicit none
      character name*(*)
      character N_prob*(*)
      integer i,last,inlen,nlen,ios
      real limit,value
      character fmt*16
      parameter (limit = 5.0e-4)
!
! Take the lengths from the arguments so that this still works if
! the lengths of NAME or N_PROB change.
!
      inlen = len(name)
      nlen  = len(N_prob)
!
! Build the format (F<nlen>.2) at run time.  Blanks inside a format
! specification are ignored, so the padding from I6 is harmless.
!
      value = 0.0
      if (nlen.gt.0) then
        write(fmt,'(a,i6,a)') '(F',nlen,'.2)'
        read(N_prob,fmt,iostat=ios) value
        if (ios.ne.0) value = 0.0
      end if
!
! Put in the ';' if needed.  The shift runs from right to left,
! one character at a time, so that no character position is read
! after it has been overwritten.
!
      if (value.gt.limit .and. inlen.gt.0) then
        do i = inlen, 2, -1
          name(i:i) = name(i-1:i-1)
        end do
        name(1:1) = ';'
      end if
!
! Convert blanks to underscores and remember the position of the
! last character that is neither a blank nor an underscore.
!
      last = 0
      do i = 1, inlen
        if (name(i:i).eq.' ') name(i:i) = '_'
        if (name(i:i).ne.'_') last = i
      end do
!
! Everything after that position goes back to blanks.  If there is
! no such position the name was empty.
!
      if (last.eq.0) then
        name = 'UNKNOWN'
      else if (last.lt.inlen) then
        name(last+1:inlen) = ' '
      end if
!
      return
      end