In article <Pine.SOL.3.95.970114164725.1957A-100000 at ursa.cus.cam.ac.uk>, "P. Kallblad" <pk215 at cus.cam.ac.uk> writes:
>I need to align sequences of proteins from PDB files with normal sequence
>files. Does anybody know about a program that converts PDB files into PIR
>format or into a similar sequence-only format?
>Thanks for any suggestions.

First, get the NRL_3D database, as that is basically a compendium of
all of the PDB sequences. If you still need to do it, here is the script
that we use, which works "mostly". It's in DCL, so if you're on Unix,
start translating!!!
Regards,
David Mathog
mathog at seqaxp.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech
************************************************************
$! ***************************************************************
$! PDBTOGCG.COM
$!
$! Command procedure to convert PDB sequence(s) to GCG sequence(s).
$! It will accept the name of the PDB file as a parameter, otherwise
$! it prompts for it.
$! This only works for PROTEIN sequences!
$!
$! Be sure to modify the RUN HACKEM line to match your site!
$!
$! David Mathog, CalTech Biology, 1-DEC-1993
$! ************************************************************
$
$! If the user interrupts with Ctrl-Y, jump straight to the label at the
$! end of the procedure.  Note that this bypasses the temporary-file
$! deletes further down.
$ on control_y then goto terminate
$! Shorthand symbols: ws writes a line to SYS$OUTPUT, iq prompts for input.
$ ws := "write sys$output"
$ iq := inquire/nopunctuation
$
$! Take the PDB file name from P1 when it is supplied; otherwise prompt
$! for it.  An empty answer to the prompt aborts the conversion.
$ if (P1 .eqs."")
$then
$ iq pdbfile "Name of the PDB file to convert to GCG sequence format "
$ if pdbfile .nes. "" then goto okpdbname
$ ws "Message from PDBtoGCG:"
$ ws "Cannot convert unless you supply the name of a PDB file!"
$ ws "Conversion aborted"
$ stop
$else
$ pdbfile = P1
$endif
$okpdbname:
$!
$! Build a throw-away command procedure that runs HACKEM with the PDB
$! file name as its input line (HACKEM reads the name from unit 5),
$! execute it with its output sent to the null device, then delete it.
$! The "gcgexrun:" location is site specific - edit it for your site.
$!
$ open/write tfil killmepdbtogcg.com
$ write tfil "$run gcgexrun:hackem"
$ write tfil PDBFILE
$ close tfil
$ @killmepdbtogcg.com/out=nla0:
$ delete killmepdbtogcg.com.
$!
$! Run GCG REFORMAT over every per-chain file HACKEM produced, with its
$! terminal output discarded, then purge the hackem*.pep files.
$! NOTE(review): presumably REFORMAT writes a new version of each file
$! and the purge removes the HACKEM-written original - confirm.
$!
$ define/user sys$output nla0:
$ reformat/threeintoone/protein/default hackem*.pep
$ purge hackem*.pep
$!
$! now rename them
$!
$!
$! No error checking after this point !!!!
$!
$! extract the root part of the name for constructing the final output
$! file's name
$!
$ file = F$SEARCH(PDBFILE)
$ root = f$parse(FILE,,,"NAME") ! root name of sequence
$!
$! rename any and all output files
$!
$! Loop: fetch the next HACKEM*.PEP file, strip "HACKEM" from its name to
$! leave the chain suffix, and rename it to <root><suffix>.PEP.  The loop
$! ends when F$SEARCH returns an empty string.  count tallies the files.
$!
$count = 0
$TOP:
$ file = F$SEARCH("HACKEM*.PEP")
$ if(file .eqs. "")then goto byebye
$ count = count + 1
$ chunk = F$PARSE(FILE,,,"NAME")
$ chunk = chunk - "HACKEM"
$ chunk := "''ROOT'''chunk'.PEP"
$ rename 'file' 'chunk'
$ ws "Sequence ''count' in file ''chunk'"
$ goto TOP
$!
$! Clean up HACKEM's summary file and report how many sequences were
$! extracted (or warn when none were found).
$!
$BYEBYE:
$ delete hackem.out.
$ if count .eq. 0
$ then
$ ws "PDBtoGCG: WARNING: no sequences were found in ''PDBFILE'"
$ else
$ ws "PDBtoGCG: Extracted ''COUNT' sequences from ''PDBFILE'"
$ endif
$!
$! Ctrl-Y handler target; also the normal end of the procedure.
$!
$terminate:
**********************************************************************
C HACKEM.FOR
C
C 1-DEC-1993, David Mathog, biology division, Caltech
C
C *Incredibly* simple minded program that takes a PDB file
C and spits the SEQRES records out into separate files for
C each chain for further processing by GCG REFORMAT.
C
C Input:  The name of the PDB file, read as one line from unit 5.
C         An empty line or a read error selects the default
C         input file HACKEMPDB.
C Output: hackem.out
C             Number of chains
C             Names of the per-chain files, one per line
C         One file per chain, named hackem_<chain id>.pep, e.g.
C             hackem_1.pep
C             hackem_A.pep
C             hackem_2.pep etc.
C         Each chain file holds a title line, a ".." line, and
C         then columns 20-70 of every SEQRES record of the chain.
C
C Note, if a chain has NO NAME there will be a space in the 12
C position. In this case, the single output sequence will be named
C hackem.pep.
C
C Scanning stops at the first non-SEQRES record that follows a
C SEQRES record, or at end of file / read error.
C
C SEQRES line format is:
C   6A1        SEQRES
C   I4         Serial # for current chain
C   1X         space
C   A1         Chain identifier (any symbol!)
C   1X         space
C   I4         Number of residues in this chain
C   1X         space
C   13(1X,A3)  Residue names, in three letter code format
C
C plus sometimes junk on the end
C
C Dialect: DEC/VAX FORTRAN.  Uses the Q edit descriptor (record
C length), CARRIAGECONTROL=, READONLY, DISPOSE= and DO WHILE.
C
      implicit none
      character*200 file,listof,outfile
      character*132 line
      character*1 chain
      integer*4 status,linelen,filelen,count,i,outlen
      logical ok
c
c     Read the file name into the WHOLE variable so that it is blank
c     padded, then clamp the reported record length to the variable
c     size so the substring below can never run past the end.
c
      filelen = 0
      write(6,*)'Name of the PDB file to process: '
      read(5,'(q,a)',iostat=status)filelen,file
      if(filelen .gt. len(file))filelen = len(file)
      if(status .ne. 0 .or. filelen .eq. 0)then
        write(6,*)'HACKEM: using default input file of HACKEMPDB'
        file = 'HACKEMPDB'
        filelen=9
      end if
c
c     open the input file
c
      open(unit=10,file=file(1:filelen)
     1 ,form='FORMATTED',carriagecontrol='LIST',status='OLD'
     1 ,READONLY,iostat=status)
      if(status .ne. 0)
     1 stop 'HACKEM: fatal error, could not open PDB file'
c
c     scan it for SEQRES records
c     The initial chain value should NEVER occur in a PDB file!
c
      ok = .true.
      chain = char(1)
      count = 0
      linelen = 0
c
      do while(ok)
c
c       Reading into all of LINE blank-fills short records, so no
c       text from the previous record can be mistaken for part of
c       this one.  Records longer than LINE are truncated.
c
        read(10,'(q,a)',iostat=status)linelen,line
        if(linelen .gt. len(line))linelen = len(line)
        if(status .ne. 0)then
c         possibly an error, but treat it like an EOF
          ok = .false.
        else if(line(1:6) .ne. 'SEQRES')then
c         the SEQRES section is over once a chain has been seen
          if(count .ge. 1)ok = .false.
        else
          if(chain .ne. line(12:12))then
c
c           new chain: finish the previous chain's file (if there
c           is one), remember the chain id and start a new file
c
            if(count .ge. len(listof))
     1 stop 'HACKEM: fatal error, too many chains'
            if(count .ge. 1)close(unit=11,dispose='SAVE')
            chain = line(12:12)
            count = count + 1
            listof(count:count) = chain
            call makename(outfile,outlen,chain)
            open(unit=11,file=outfile(1:outlen)
     1 ,form='FORMATTED',carriagecontrol='LIST',status='NEW'
     1 ,iostat=status)
            if(status .ne. 0)
     1 stop 'HACKEM: fatal error, could not open output file'
            write(11,'(a)')'Sequence of chain '//chain//' from '
     1 //file(1:filelen)
            write(11,'(a)')'..'
          end if
c
c         residue names occupy columns 20-70; anything beyond that
c         is junk.  A record too short to reach column 20 carries
c         no residues, so nothing is written for it.
c
          if(linelen .gt. 70)linelen = 70
          if(linelen .ge. 20)write(11,'(a)')line(20:linelen)
        end if
      end do
c
      close(unit=10)
c
c     write out hackem.out - number of chains and names of files
c
      if(count .ge. 1)close(unit=11,dispose='SAVE')
      open(unit=11,file='hackem.out'
     1 ,form='FORMATTED',carriagecontrol='LIST',status='NEW'
     1 ,iostat=status)
      if(status .ne. 0)
     1 stop 'HACKEM: fatal error, could not create "hackem.out"'
      write(11,'(I4)')count
      if(count .ge. 1)then
        do i = 1, count
          call makename(outfile,outlen,listof(i:i))
          write(11,'(a)')outfile(1:outlen)
        end do
      end if
c
      close(unit=11,dispose='SAVE')
c
      stop 'HACKEM: processing completed'
      end
! MAKENAME - build the name of the output file for one chain.
!
!   file   receives the file name, blank padded to 200 characters
!   flen   receives the number of significant characters in file
!   chain  one-character chain identifier
!
! A blank chain identifier gives 'hackem.pep' (10 characters); any
! other identifier gives 'hackem_<chain>.pep' (12 characters).
!
      subroutine makename(file,flen,chain)
      implicit none
      character*200 file
      character*1 chain
      integer*4 flen
!     start from the unnamed-chain result, override for a named chain
      file = 'hackem.pep'
      flen = 10
      if(chain .ne. ' ')then
        file = 'hackem_'//chain//'.pep'
        flen = 12
      end if
      return
      end