IUBio

CDS parse software for UNIX - is something available?

Michele Clamp michele at speed.ebi.ac.uk
Fri Jun 5 04:43:38 EST 1998


> Alexander Kanapin wrote:
> 
> > I am looking for softwrare (source code) for simple parsing of GenBank
> > CDS field. Does anybody know about such tools? I need some kind of
> > automatic splicing - to read CDS data and produse new sequence string
> > according to CDS coordinates of coding regions from entry sequence.
> > 
> 

The following perl script I think may do what you want.  It produces its 
protein coding sequences by translating the DNA and I've come across 
a few discrepancies between the DNA translations and the quoted 
translation in the feature table (e.g. you get stop codons turning up 
unexpectedly).  It's also human specific because of the DNA translation
so you'll have to add more coding tables if you want to use it
for other organisms.

#!/usr/local/bin/perl

# Michele Clamp 25/7/97
#
# Reads a Genbank format database file and outputs various features/
# sequences.  Originally written for outputting coding and non-coding
# sequence of complete genes.

if (! @ARGV) {
  usage();
  exit(0);
}

use Getopt::Std;
getopts('lLdaNksocniftEIeCAhS');


if ($opt_h) {
    usage();
}

# -h = help
# -l = length .
# -d = DEFINITION .
# -a = ACCESSION .
# -N = NID 
# -o = ORGANISM .
# -c = Coding sequence - dna
# -C = Coding sequence - aa
# -f = 5' sequence
# -t = 3' sequence
# -e = exon list
# -E = exon sequences
# -I = intron sequences
# -A = all sequence - dna
# -S = SRS format sequences - numbers at the right

$| = 1;

while (<>) {
    #LOCUS marks the beginning of a new entry
    if (/^LOCUS +(\S+) + (\S+)/) {
        $id = $1;
        $length = $2;

        if ($opt_l) {
            print("$length\n");
        }

        #Read the entry until //
        while (($line = <>) && ($line !~ /^\/\//)) {
            #Do the easy stuff first
            if ($line =~ /^ACCESSION +(\S+)/) {
                $acc = $1;
                if ($opt_a) {
                    print("$acc\n");
                }
            }

            if ($line =~ /^DEFINITION +(.*)\n/) {
                $def = $1;
                if ($opt_d) {
                    print("$def\n");
                }
            }
            
            if ($line =~ /ORGANISM +(.*)\n/) {
                $org = $1;
                if ($opt_o) {
                    print("$org\n");
                }
            }
            
            if ($line =~ /^NID +(.*)\n/) {
                $nid = $1;
                if ($opt_N) {
                    print("$nid\n");
                }
            }
            #We only want to process sequence stuff if we have to
            if ($opt_c || $opt_C || $opt_n || $opt_i || $opt_f || 
                $opt_t || $opt_e || $opt_b || $opt_E || $opt_I || opt_A)  {

                #Found the CDS line which defines where the exons are
                if ($line =~ /^ +CDS +/)    {
                    $cds = "";
                    #Read in the cds lines
                    while ($line !~ /\)/) {
                        chop($line);
                        $cds .= $line;
                        $line = <>;
                    }

                    chop($line);
                    $cds .= $line;
                    
                    $cds =~ s/ //g;
                    $cds =~ s/.*join\((.*)\).*/$1/;

                    #Exons are in an array
                    @exons = split(/\,/,$cds);

                    undef(@exstart);
                    undef(@exend);

                    #loop over the exons and push the
                    #start and end points into arrays
                    foreach $exon (@exons) {
                        if ($exon =~ /\.\./) {
                            ($start,$end) = split(/\.\./,$exon);
                            push(@exstart,$start);
                            push(@exend,$end);
                        } else {
                            print(STDERR "Can't split exon $exon of $id\n");
                        }
                    }

                    #Exon list - print if option
                    if ($opt_e) {
                        for ($i = 0; $i <= $#exstart; $i++) {
                            print("$exstart[$i] - $exend[$i]\n");
                        }
                        print("\n");
                    }

                } # end if ($line = CDS
                
                if ($line =~ /\/translation=\"(.*)\n/) {
                    $trans = $1;
                    while ($trans !~ /\"/) {
                        $line = <>;
                        $line =~ s/\n//g;
                        $line =~ s/ //g;
                        $trans .= $line;
                    }

                    $trans =~ s/\"//g;

                    #Take off Start codon
                    $trans = substr($trans,1);
                    $trans =~ s/(.{72})/$1\n/g;
                }
                
                #Get the sequence out - read the dna sequence
                #paste the exons together and translate
                if ($line =~ /ORIGIN/ && !$opt_S) {
                    $sum = 0;
                    $seq = "";

                    while ($sum < $length) {
                        $tmp = <>;
                        $tmp =~ s/ +\S+ +(.*)\n/$1/;
                        $tmp =~ s/ //g;
                        $seq .= $tmp;
                        $sum = length($seq);
                    }
                    if ($length != length($seq)) {
                        print(STDERR "ERROR: Sequence lengths don't match\n");
                        $len = length($seq);
                        print(STDERR "Sequence length in feature table = $length\n");
                        print(STDERR "Sequence length from entry = $len\n");
                    }
                }
                #This is for genbank entries retrieved using SRS
                #The sequence is not preceded by ORIGIN and the
                #numbers are at the end of the line as opposed to before
                if ($opt_S && ($line =~ /BASE/)) {
                    $sum = 0;
                    $seq = "";
                    
                    while (($sum < $length) && (($tmp = <>) !~ /^\/\//)) {
                        if ($tmp !~ /Length/) {
                            $tmp =~ s/ +(.*) +.*\n/$1/;
                            $tmp =~ s/ //g;
                            $seq .= $tmp;
                            $sum = length($seq);
                        }
                    }
                    if ($length != length($seq)) {
                        print(STDERR "ERROR: Sequence lengths don't match\n");
                        $len = length($seq);
                            print(STDERR "Sequence length in feature table = $length\n");
                        print(STDERR "Sequence length from entry = $len\n");
                    }
                }

            }
        } # end while (($line = <>

        if ($opt_c || $opt_C || $opt_n || $opt_i || $opt_f || 
            $opt_t || $opt_e || $opt_b || $opt_E || $opt_I || opt_A)  {
            
            #print all the dna out
            if ($opt_A) {
                ($chopseq = $seq) =~ s/(.{72})/$1\n/g;
                print(">$id\n$chopseq\n");
            }
            #Now process the CDS list for the coding dna only
            $exstr = "";
            $protein = "";
            
            for ($i = 0; $i <= $#exstart; $i++) {
                $tmpstr = substr($seq,$exstart[$i]+1,$exend[$i] - $exstart[$i]+1);
                $exstr .= $tmpstr;
                if ($opt_E) {
                    $tmpstr =~ s/(.{72})/$1\n/g;
                    $start = $exstart[$i]; $end = $exend[$i];
                    print(">${id}_exon$i $start - $end\n$tmpstr\n");
                }

            }
            
            #We also want to extract the non-coding sequence
            #We'll annotate the 5', 3' and intronic

            if ($opt_f) {
                #5' and 3' first
                $seq5 = substr($seq,0,$exstart[0]-1);
                $seq5 =~ s/(.{72})/$1\n/g;
                $end = $exstart[0] - 1;
                print(">${id}_5UTR 1 - $end\n$seq5\n");
            }
            
            if ($opt_I) {
                #Now the introns
                for ($i = 0; $i < $#exstart; $i++) {
                    $seqi = substr($seq,$exend[$i],$exstart[$i+1] - $exend[$i] - 1);
                    $start = $exend[$i]+1; $end = $exstart[$i+1] - 1;
                    $seqi =~ s/(.{72})/$1\n/g;
                        print(">${id}_intron$i $start-$end\n$seqi\n");
                }
            }

            if ($opt_t) {
                #And the 3'
                $seq3 = substr($seq,$exend[$#exend] ,$length - $exend[$#exend]);
                $seq3 =~ s/(.{72})/$1\n/g;
                $start = $exend[$#exend]+1; $end = $length;
                   print(">${id}_3UTR $start-$end\n$seq3\n\n");
            }

            if ($opt_C) {
                $protein = translate($exstr,2);
                $protein =~ s/(.{72})/$1\n/g;
                
                #Take off stop codon
                $protein =~ s/\*$//;
                print(">$id\n$protein\n");
            
                if ($protein =~ /\*../) {
                    print(STDERR "WARNING: Coding sequence $id contains stop codons\n");
                }
            }
            if ($opt_c) {
                $exstr =~ s/(.{72})/$1\n/g;
                print(">$id\n$exstr\n");
            }
        }
    } # end if (/^LOCUS...
}


sub translate  {
    my($seq,$frame) = @_;
    my($i);
    if ($frame == "") {
        $frame = 1;
    }
    
    $aa{"TTT"} = "F";
    $aa{"TTC"} = "F";
    $aa{"TTA"} = "L";
    $aa{"TTG"} = "L";

    $aa{"TCT"} = "S";
    $aa{"TCC"} = "S";
    $aa{"TCA"} = "S";
    $aa{"TCG"} = "S";

    $aa{"TAT"} = "Y";
    $aa{"TAC"} = "Y";
    $aa{"TAA"} = "*";
    $aa{"TAG"} = "*";

    $aa{"TGT"} = "C";
    $aa{"TGC"} = "C";
    $aa{"TGA"} = "*";
    $aa{"TGG"} = "W";

    $aa{"CTT"} = "L";
    $aa{"CTC"} = "L";
    $aa{"CTA"} = "L";
    $aa{"CTG"} = "L";

    $aa{"CCT"} = "P";
    $aa{"CCC"} = "P";
    $aa{"CCA"} = "P";
    $aa{"CCG"} = "P";

    $aa{"CAT"} = "H";
    $aa{"CAC"} = "H";
    $aa{"CAA"} = "Q";
    $aa{"CAG"} = "Q";

    $aa{"CGT"} = "R";
    $aa{"CGC"} = "R";
    $aa{"CGA"} = "R";
    $aa{"CGG"} = "R";

    $aa{"ATT"} = "I";
    $aa{"ATC"} = "I";
    $aa{"ATA"} = "I";
    $aa{"ATG"} = "M";

    $aa{"ACT"} = "T";
    $aa{"ACC"} = "T";
    $aa{"ACA"} = "T";
    $aa{"ACG"} = "T";

    $aa{"AAT"} = "N";
    $aa{"AAC"} = "N";
    $aa{"AAA"} = "K";
    $aa{"AAG"} = "K";

    $aa{"AGT"} = "S";
    $aa{"AGC"} = "S";
    $aa{"AGA"} = "R";
    $aa{"AGG"} = "R";

    $aa{"GTT"} = "V";
    $aa{"GTC"} = "V";
    $aa{"GTA"} = "V";
    $aa{"GTG"} = "V";

    $aa{"GCT"} = "A";
    $aa{"GCC"} = "A";
    $aa{"GCA"} = "A";
    $aa{"GCG"} = "A";

    $aa{"GAT"} = "D";
    $aa{"GAC"} = "D";
    $aa{"GAA"} = "E";
    $aa{"GAG"} = "E";

    $aa{"GGT"} = "G";
    $aa{"GGC"} = "G";
    $aa{"GGA"} = "G";
    $aa{"GGG"} = "G";

    $seq =~ tr/a-z/A-Z/;

    $start = $frame-1;

    for ($i = $start; $i < length($seq); $i = $i + 3) {
        $codon = substr($seq,$i,3);
        $acid = $aa{$codon};
#       print("$codon = $acid\n");
        $protein .= $acid;
    }

    return($protein);
}
    
sub usage {

# -h = help
# -l = length .
# -d = DEFINITION .
# -a = ACCESSION .
# -N = NID 
# -o = ORGANISM .
# -c = Coding sequence - dna
# -C = Coding sequence - aa
# -i = intronic sequece
# -f = 5' sequence
# -t = 3' sequence
# -e = exon list
# -E = exon sequences
# -I = intron sequences
# -A = all sequence - dna

    print("Usage codnocod -hldaNocCfteEIA < genbankfile\n");
    print("
Reads genbank format sequence entries and outputs various features

          -h = help
          -l = length 
          -d = DEFINITION 
          -a = ACCESSION 
          -N = NID 
          -o = ORGANISM 
          -c = Coding sequence - dna
          -C = Coding sequence - aa
          -f = 5' sequence
          -t = 3' sequence
          -e = exon list
          -E = exon sequences
          -I = intron sequences
          -A = all sequence - dna
          -S = entries retrieved using SRS - no ORIGIN line and sequence is
               numbered on the right

");
    exit(0);
}

-- 
Dr Michele Clamp ,        EMBL-European Bioinformatics Institute 
Hinxton, Cambs CB10 1SD,  email : michele at ebi.ac.uk
UK                        Tel:  01223 494682    Fax: 01223 494468




More information about the Bio-soft mailing list

Send comments to us at biosci-help [At] net.bio.net