codon frequency tables

Martin Hilbers hilbers at lionbio.co.uk
Fri Jun 18 11:07:27 EST 1999

Peter Rice wrote:
> proy at rsvs.ulaval.ca (Paul Roy) writes:
> > Dear GCGers:
> >      Do you know of anyone who has a database of codon frequency tables in
> > GCG format?  There was a large database of tables at a Japanese site (but
> > not in GCG format) and a smaller one (which our system lost when version
> > 10 was installed) which originally came from EGCG version 8.1.  I don't
> > know if it would still be available at EBI.  Or has anyone written a
> > program to reformat the Japanese database?
> The EGCG one included some files that were already generated by
> scientists at EMBL but mainly (the files with the 3 letter names) from
> the TRANSTERM database (maybe not an obvious place to look) now
> available from ftp://ftp.ebi.ac.uk/pub/databases/transterm/
> There is now a database generated from Genbank called CUTG which is
> indexed on various SRS servers. File cutg.dat is in GCG format but all
> in one file. SRS can extract the files for you, or you can use a
> simple script to write individual files. There was one available from
> http://www.seqnet.dl.ac.uk/srsdoc/getcutg but sadly SEQnet has moved
> so the URL does not work any longer. There are over 7000 codon usage
> tables in this database.

I have attached the getcutg perl script to this message.


Martin Hilbers            Customer Support Specialist
LION Bioscience           Main:  +44 (0) 1223 224 700
Sheraton House            Phone: +44 (0) 1223 224 711
Castle Park               Fax:   +44 (0) 1223 224 701
Cambridge CB3 0AX        
UNITED KINGDOM            Email:hilbers at lionbio.co.uk
-------------- next part --------------
# perl script to combine the cutg spsum files to a single flat file
# which contains all the codon usage tables in a gcg compatible format.
# usage: ./getcutg /data/cutg/*.spsum  > /usr/local/srs/data/cutg.dat
#                                                Martin Hilbers - 1996
# 5/11/97 - changed id from a number to a string based on the
# name of the species and the genbank section (eg homo_sapiens_pr)

use strict;
use warnings;

# GenBank division tag (looked for in the spsum file name) mapped to the
# section title printed in each record and the two-letter suffix appended
# to the CUTG ID.
my %DIVISION_INFO = (
    gbbct => [ 'Bacteria',      'ba' ],
    gbinv => [ 'Invertebrates', 'in' ],
    gbmam => [ 'Mammals',       'om' ],
    gbphg => [ 'Phages',        'ph' ],
    gbpln => [ 'Plants',        'pl' ],
    gbpri => [ 'Primates',      'pr' ],
    gbrod => [ 'Rodents',       'ro' ],
    gbvrl => [ 'Viral',         'vi' ],
    gbvrt => [ 'Vertebrates',   'ov' ],
);

# Output layout: one inner list per block of four lines, each entry being
# [ amino acid, codon, 1-based position of its count on the counts line ].
# NOTE(review): the positions encode the codon order this script assumes
# for the spsum counts line (Arg first, stops last) -- confirm against the
# CUTG release notes before relying on it for a newer release.
my @CODON_GROUPS = (
    [ [qw(Gly GGG 33)], [qw(Gly GGA 31)], [qw(Gly GGT 34)], [qw(Gly GGC 32)] ],
    [ [qw(Glu GAG 48)], [qw(Glu GAA 47)], [qw(Asp GAT 50)], [qw(Asp GAC 49)] ],
    [ [qw(Val GTG 37)], [qw(Val GTA 35)], [qw(Val GTT 38)], [qw(Val GTC 36)] ],
    [ [qw(Ala GCG 29)], [qw(Ala GCA 27)], [qw(Ala GCT 30)], [qw(Ala GCC 28)] ],
    [ [qw(Arg AGG  6)], [qw(Arg AGA  5)], [qw(Ser AGT 18)], [qw(Ser AGC 17)] ],
    [ [qw(Lys AAG 40)], [qw(Lys AAA 39)], [qw(Asn AAT 42)], [qw(Asn AAC 41)] ],
    [ [qw(Met ATG 60)], [qw(Ile ATA 57)], [qw(Ile ATT 59)], [qw(Ile ATC 58)] ],
    [ [qw(Thr ACG 21)], [qw(Thr ACA 19)], [qw(Thr ACT 22)], [qw(Thr ACC 20)] ],
    [ [qw(Trp TGG 61)], [qw(End TGA 64)], [qw(Cys TGT 54)], [qw(Cys TGC 53)] ],
    [ [qw(End TAG 63)], [qw(End TAA 62)], [qw(Tyr TAT 52)], [qw(Tyr TAC 51)] ],
    [ [qw(Leu TTG 12)], [qw(Leu TTA 11)], [qw(Phe TTT 56)], [qw(Phe TTC 55)] ],
    [ [qw(Ser TCG 15)], [qw(Ser TCA 13)], [qw(Ser TCT 16)], [qw(Ser TCC 14)] ],
    [ [qw(Arg CGG  3)], [qw(Arg CGA  1)], [qw(Arg CGT  4)], [qw(Arg CGC  2)] ],
    [ [qw(Gln CAG 44)], [qw(Gln CAA 43)], [qw(His CAT 46)], [qw(His CAC 45)] ],
    [ [qw(Leu CTG  9)], [qw(Leu CTA  7)], [qw(Leu CTT 10)], [qw(Leu CTC  8)] ],
    [ [qw(Pro CCG 25)], [qw(Pro CCA 23)], [qw(Pro CCT 26)], [qw(Pro CCC 24)] ],
);

# Number of count fields read from a counts line (positions 1..64 above).
my $CODON_COUNT = 64;

# division_for_file($path) -> ($section, $sec)
# Work out the section title and ID suffix from an spsum file name.  Only
# the file name itself (not its directories) up to the first '.' is
# examined, so "./gbbct.spsum" and "/data/cutg.v1/gbbct.spsum" both work.
# An unrecognised name yields (that name stem, '') rather than silently
# reusing the suffix of the previously processed file.
sub division_for_file {
    my ($path) = @_;

    (my $base = $path) =~ s{.*/}{};
    my ($stem) = split /\./, $base;
    $stem = '' unless defined $stem;

    for my $tag (sort keys %DIVISION_INFO) {
        return @{ $DIVISION_INFO{$tag} } if $stem =~ /\Q$tag\E/;
    }
    return ($stem, '');
}

# format_cutg_record($species, $num, $section, $sec, $counts_line) -> string
# Build one GCG-style codon usage record.
#   $species     species name, printed as-is and used for the CUTG ID
#   $num         text printed after "SEQUENCES:" (no trailing newline)
#   $section     section title printed after "SECTION:"
#   $sec         ID suffix; joined to the species with '_' when non-empty
#   $counts_line whitespace-separated codon counts; missing fields count as 0
# Each table line carries the raw count, the count per thousand codons and
# the fraction of that amino acid's codons.
sub format_cutg_record {
    my ($species, $num, $section, $sec, $counts_line) = @_;

    # A leading dummy element keeps the 1-based positions of @CODON_GROUPS.
    my @freq = (0, split ' ', $counts_line);
    push @freq, 0 while @freq <= $CODON_COUNT;

    my $tot = 0;
    $tot += $_ for @freq;

    my %aa_total;
    for my $group (@CODON_GROUPS) {
        $aa_total{ $_->[0] } += $freq[ $_->[2] ] for @$group;
    }

    my $name = length($sec) ? "$species $sec" : $species;
    $name =~ s/ /_/g;

    my $out = "CUTG ID:   $name\n"
            . "SPECIES:   $species\n"
            . "SECTION:   $section\n"
            . "SEQUENCES: $num\n"
            . "CODONS:    $tot\n"
            . "AmAcid  Codon    Number       /1000    Fraction   ..\n";

    # Zero totals are replaced by 1 for division only, so an empty table
    # prints 0.00 everywhere instead of dying.
    my $all_codons = $tot || 1;

    for my $group (@CODON_GROUPS) {
        $out .= "\n";
        for my $row (@$group) {
            my ($aa, $codon, $pos) = @$row;
            my $count = $freq[$pos];
            # The column is headed "/1000", so scale to codons per thousand.
            $out .= sprintf "%-7s %s %11.2f %11.2f %11.2f\n",
                $aa, $codon, $count,
                1000 * $count / $all_codons,
                $count / ($aa_total{$aa} || 1);
        }
    }
    return $out;
}

# Main loop: every spsum file named on the command line is read as pairs of
# lines ("<species>: <number of sequences>" followed by the counts line) and
# one record per pair is written to standard output.
foreach my $file (@ARGV) {
    my ($section, $sec) = division_for_file($file);

    open(my $fh, '<', $file) or do {
        warn "getcutg: cannot open $file: $!\n";
        next;
    };

    while (defined(my $header = <$fh>)) {
        chomp $header;

        # Split on the last ": " so a species name containing ": " survives.
        my ($species, $num) = $header =~ /\A(.*): (.*)\z/;
        unless (defined $species) {
            warn "getcutg: $file line $.: unrecognised header line skipped\n"
                if $header =~ /\S/;
            next;
        }

        my $counts_line = <$fh>;
        unless (defined $counts_line) {
            warn "getcutg: $file: no counts line for '$species'\n";
            last;
        }

        print format_cutg_record($species, $num, $section, $sec, $counts_line);
    }

    close $fh;
}

More information about the Info-gcg mailing list

Send comments to us at biosci-help [At] net.bio.net