IUBio

convert PDB format to sequence

mathog at seqaxp.bio.caltech.edu mathog at seqaxp.bio.caltech.edu
Tue Jan 14 15:15:07 EST 1997


In article <Pine.SOL.3.95.970114164725.1957A-100000 at ursa.cus.cam.ac.uk>, "P. Kallblad" <pk215 at cus.cam.ac.uk> writes:
>I need to align sequences of proteins from PDB files with normal sequence
>files. Does anybody know about a program that converts PDB files into PIR
>format or into a similar sequence-only format?
>
>Thanks for any suggestions.
>
First, get the NRL_3D database, as that is basically a compendium of
all of the PDB sequences.  If you still need to do it, here is the script
that we use, which works "mostly".  It's in DCL, so if you're on Unix,
start translating!!!

Regards,

David Mathog
mathog at seqaxp.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech 
************************************************************
$! ***************************************************************
$! PDBTOGCG.COM
$!
$! Command procedure to convert PDB sequence(s) to GCG sequence(s).
$! It will accept the name of the PDB file as a parameter, otherwise
$! it prompts for it.
$! This only works for PROTEIN sequences!
$!
$! Be sure to modify the RUN HACKEM line to match your site!
$!
$! David Mathog, CalTech Biology, 1-DEC-1993
$! ************************************************************
$ 
$       on control_y then goto terminate
$       ws := "write sys$output"
$       iq := inquire/nopunctuation
$
$ if (P1 .eqs."")
$then
$  iq pdbfile "Name of the PDB file to convert to GCG sequence format "
$  if pdbfile .nes. "" then goto okpdbname
$  ws "Message from PDBtoGCG:"
$  ws "Cannot convert unless you supply the name of a PDB file!"
$  ws "Conversion aborted"
$  stop
$else
$  pdbfile = P1
$endif
$okpdbname:
$!
$ open/write tfil killmepdbtogcg.com
$ write tfil "$run gcgexrun:hackem"
$ write tfil PDBFILE
$ close tfil
$ @killmepdbtogcg.com/out=nla0:
$ delete killmepdbtogcg.com.
$!
$ define/user sys$output nla0:
$ reformat/threeintoone/protein/default hackem*.pep
$ purge hackem*.pep
$!
$! now rename them
$!
$!
$!  No error checking after this point !!!!
$!
$! extract the root part of the name for constructing the final output 
$! file's name
$!
$ file = F$SEARCH(PDBFILE)
$ root = f$parse(FILE,,,"NAME") ! root name of sequence
$!
$! rename any and all output files
$!
$count = 0
$TOP:
$ file = F$SEARCH("HACKEM*.PEP")
$ if(file .eqs. "")then goto byebye
$ count = count + 1
$ chunk = F$PARSE(FILE,,,"NAME")
$ chunk = chunk - "HACKEM"
$ chunk := "''ROOT'''chunk'.PEP"
$ rename 'file' 'chunk'
$ ws "Sequence ''count' in file ''chunk'"
$ goto TOP
$!
$BYEBYE:
$ delete hackem.out.
$ if count .eq. 0
$ then 
$   ws "PDBtoGCG: WARNING: no sequences were found in ''PDBFILE'"
$ else
$   ws "PDBtoGCG: Extracted ''COUNT' sequences from ''PDBFILE'"
$ endif
$!
$terminate:
**********************************************************************
C	HACKEM.FOR
C
C       1-DEC-1993, David Mathog, biology division, Caltech
C
C	*Incredibly* simple minded program that takes a PDB file
C	and spits the SEQRES records out into separate files for
C	each chain for further processing by GCG REFORMAT.
C
C	Input:  The name of the PDB file
C	Output: hackem.out
C	           Number of Chains
C	           Names of files = hackem.chain_id
C	        hackem_1.pep
C	        hackem_A.pep
C	        hackem_2.pep etc.
C
C       Note, if a chain has NO NAME there will be a space in the 12
C	position.  In this case, the single output sequence will be named
C	hackem.pep.
C
C
C	SEQRES line format is:
C	     6A1     SEQRES
C             I4     Serial # for current chain
C	      1X     space
C	      A1     Chain identifier (any symbol!)
C	      1X     space
C	      I4     Number of residues in this chain
C	      1X     space
C       13(1X,A3)    Residue names, in three letter code format
C
C	plus sometimes junk on the end
C
	implicit none
	character*200  file,listof,outfile
	character*132  line
	character*1    chain
	integer*4      status,linelen,filelen,count,i,outlen
	logical        OK
c
	write(6,*)'Name of the PDB file to process: '
	read(5,'(q,a)',iostat=status)filelen,file(1:filelen)
	if(status .ne. 0 .or. filelen .eq. 0)then
	   write(6,*)'HACKEM: using default input file of HACKEMPDB'
	   file = 'HACKEMPDB'
	   filelen=9
	end if
c
c	open the input file
c
	open(unit=10,file=file(1:filelen)
	1 ,form='FORMATTED',carriagecontrol='LIST',status='OLD'
	1 ,READONLY,iostat=status)
	if(status .ne. 0)
	1  stop 'HACKEM: fatal error, could not open PDB file'
c
c	scan it for SEQRES records
c	The initial chain value should NEVER occur in a PDB file!
c
	ok    = .true.
	chain = char(1)
	count = 0
c
	do while(ok)
	   read(10,'(q,a)',iostat=status)linelen,line(1:linelen)
	   if(status .eq. 0)then
	     if(line(1:6) .eq. 'SEQRES')then
	        if(chain .ne. line(12:12))then	!new chain
	          chain = line(12:12)
	          count = count + 1
	          listof(count:count) = chain
	          if(count.gt.0)close(11,dispose='SAVE')
	          call makename(outfile,outlen,chain)
	          open(unit=11,file=outfile(1:outlen)
	1         ,form='FORMATTED',carriagecontrol='LIST',status='NEW'
	1         ,iostat=status)
	          if(status .ne. 0)
	1            stop 'HACKEM: fatal error, could not open output file'
	          write(11,'(a)')'Sequence of chain '//chain//' from '
	1         //file(1:filelen)
	          write(11,'(a)')'..'
	        end if
	        if(linelen .gt. 70)linelen=70
	        write(11,'(a)')line(20:linelen)
	     else
	        if(count .ge. 1)ok = .false.
	     end if
	   else			!possibly an error, but treat it line an EOF
	     ok = .false.
	   end if
	end do
c
c	write out hacker.out - number of chains and names of files
c
	if(count.ge.1)close(unit=11,dispose='SAVE')
	open(unit=11,file='hackem.out'
	1  ,form='FORMATTED',carriagecontrol='LIST',status='NEW'
	1  ,iostat=status)
	if(status .ne. 0)
	1  stop 'HACKEM: fatal error, could not create "hackem.out"'
	write(11,'(I4)')count
	if(count.ge.1)then
	  do i = 1, count
	    call makename(outfile,outlen,listof(i:i))
	    write(11,'(a)')outfile(1:outlen)
	  end do
	end if
c
	close(unit=11,dispose='SAVE')
c
	stop 'HACKEM: processing completed'
	end

	subroutine makename(file,flen,chain)
	implicit none
	character*200 file
	character*1  chain
	integer*4    flen
	if(chain .eq. ' ')then
	  file = 'hackem.pep'
	  flen  = 10
	else
	  file = 'hackem_'//chain//'.pep'
	  flen  = 12
	end if
	return
	end




More information about the Info-gcg mailing list

Send comments to us at biosci-help [At] net.bio.net