codon frequency tables
Martin Hilbers
hilbers at lionbio.co.uk
Fri Jun 18 11:07:27 EST 1999
Peter Rice wrote:
>
> proy at rsvs.ulaval.ca (Paul Roy) writes:
>
> > Dear GCGers:
> > Do you know of anyone who has a database of codon frequency tables in
> > GCG format? There was a large database of tables at a Japanese site (but
> > not in GCG format) and a smaller one (which our system lost when version
> > 10 was installed) which originally came from EGCG version 8.1. I don't
> > know if it would still be available at EBI. Or has anyone written a
> > program to reformat the Japanese database?
>
> The EGCG one included some files that were already generated by
> scientists at EMBL but mainly (the files with the 3 letter names) from
> the TRANSTERM database (maybe not an obvious place to look) now
> available from ftp://ftp.ebi.ac.uk/pub/databases/transterm/
>
> There is now a database generated from Genbank called CUTG which is
> indexed on various SRS servers. File cutg.dat is in GCG format but all
> in one file. SRS can extract the files for you, or you can use a
> simple script to write individual files. There was one available from
> http://www.seqnet.dl.ac.uk/srsdoc/getcutg but sadly SEQnet has moved
> so the URL does not work any longer. There are over 7000 codon usage
> tables in this database.
>
I have attached the getcutg perl script to this message.
Cheers,
Martin
--
-----------------------------------------------------
Martin Hilbers Customer Support Specialist
LION Bioscience Main: +44 (0) 1223 224 700
Sheraton House Phone: +44 (0) 1223 224 711
Castle Park Fax: +44 (0) 1223 224 701
Cambridge CB3 0AX
UNITED KINGDOM Email:hilbers at lionbio.co.uk
-----------------------------------------------------
-------------- next part --------------
#!/usr/bin/perl
#
# perl script to combine the cutg spsum files to a single flat file
# which contains all the codon usage tables in a gcg compatible format.
#
# usage: ./getcutg /data/cutg/*.spsum > /usr/local/srs/data/cutg.dat
#
# Martin Hilbers - 1996
#
# 5/11/97 - changed id from a number to a string based on the
# name of the species and the genbank section (eg homo_sapiens_pr)
#
# Input, as consumed by this script: every record is two lines,
#   <species>: <number of sequences>
#   <whitespace separated codon counts>
# The position of each codon on the counts line is given by the index
# column of @CODON_ROWS below (1-based).
#
# Robustness notes:
#  - the GenBank section is recognised from the file's base name, so
#    relative or dotted paths (./gbpri.spsum, /data/v1.0/gbpri.spsum) work;
#  - files that cannot be opened are reported on STDERR and skipped;
#  - blank lines and a truncated final record are ignored;
#  - a record whose counts are all zero no longer aborts the run with a
#    division by zero.
#
use strict;
use warnings;
use File::Basename qw(basename);

# GenBank division tag found in the file name => section title printed in
# the record, and the two letter suffix appended to the CUTG ID.
my @SECTIONS = (
    [ 'gbbct', 'Bacteria',      'ba' ],
    [ 'gbinv', 'Invertebrates', 'in' ],
    [ 'gbmam', 'Mammals',       'om' ],
    [ 'gbphg', 'Phages',        'ph' ],
    [ 'gbpln', 'Plants',        'pl' ],
    [ 'gbpri', 'Primates',      'pr' ],
    [ 'gbrod', 'Rodents',       'ro' ],
    [ 'gbvrl', 'Viral',         'vi' ],
    [ 'gbvrt', 'Vertebrates',   'ov' ],
);

# Output rows in the order they are printed: amino acid, codon, and the
# 1-based position of that codon's count on the spsum counts line.
# A blank line is emitted before every group of four rows.
my @CODON_ROWS = (
    [ 'Gly', 'GGG', 33 ], [ 'Gly', 'GGA', 31 ], [ 'Gly', 'GGT', 34 ], [ 'Gly', 'GGC', 32 ],
    [ 'Glu', 'GAG', 48 ], [ 'Glu', 'GAA', 47 ], [ 'Asp', 'GAT', 50 ], [ 'Asp', 'GAC', 49 ],
    [ 'Val', 'GTG', 37 ], [ 'Val', 'GTA', 35 ], [ 'Val', 'GTT', 38 ], [ 'Val', 'GTC', 36 ],
    [ 'Ala', 'GCG', 29 ], [ 'Ala', 'GCA', 27 ], [ 'Ala', 'GCT', 30 ], [ 'Ala', 'GCC', 28 ],
    [ 'Arg', 'AGG',  6 ], [ 'Arg', 'AGA',  5 ], [ 'Ser', 'AGT', 18 ], [ 'Ser', 'AGC', 17 ],
    [ 'Lys', 'AAG', 40 ], [ 'Lys', 'AAA', 39 ], [ 'Asn', 'AAT', 42 ], [ 'Asn', 'AAC', 41 ],
    [ 'Met', 'ATG', 60 ], [ 'Ile', 'ATA', 57 ], [ 'Ile', 'ATT', 59 ], [ 'Ile', 'ATC', 58 ],
    [ 'Thr', 'ACG', 21 ], [ 'Thr', 'ACA', 19 ], [ 'Thr', 'ACT', 22 ], [ 'Thr', 'ACC', 20 ],
    [ 'Trp', 'TGG', 61 ], [ 'End', 'TGA', 64 ], [ 'Cys', 'TGT', 54 ], [ 'Cys', 'TGC', 53 ],
    [ 'End', 'TAG', 63 ], [ 'End', 'TAA', 62 ], [ 'Tyr', 'TAT', 52 ], [ 'Tyr', 'TAC', 51 ],
    [ 'Leu', 'TTG', 12 ], [ 'Leu', 'TTA', 11 ], [ 'Phe', 'TTT', 56 ], [ 'Phe', 'TTC', 55 ],
    [ 'Ser', 'TCG', 15 ], [ 'Ser', 'TCA', 13 ], [ 'Ser', 'TCT', 16 ], [ 'Ser', 'TCC', 14 ],
    [ 'Arg', 'CGG',  3 ], [ 'Arg', 'CGA',  1 ], [ 'Arg', 'CGT',  4 ], [ 'Arg', 'CGC',  2 ],
    [ 'Gln', 'CAG', 44 ], [ 'Gln', 'CAA', 43 ], [ 'His', 'CAT', 46 ], [ 'His', 'CAC', 45 ],
    [ 'Leu', 'CTG',  9 ], [ 'Leu', 'CTA',  7 ], [ 'Leu', 'CTT', 10 ], [ 'Leu', 'CTC',  8 ],
    [ 'Pro', 'CCG', 25 ], [ 'Pro', 'CCA', 23 ], [ 'Pro', 'CCT', 26 ], [ 'Pro', 'CCC', 24 ],
);

# Map a spsum file path to (section title, two letter ID suffix).
# Only the base name up to its first dot is inspected.  When no known
# division tag is found the stem itself is returned as the title and the
# suffix is empty (it is never inherited from a previously processed file).
sub _section_for_file {
    my ($file) = @_;

    my ($stem) = split /\./, basename($file);
    $stem = '' unless defined $stem;

    my ($section, $sec) = ($stem, '');
    for my $entry (@SECTIONS) {
        my ($tag, $title, $suffix) = @{$entry};
        ($section, $sec) = ($title, $suffix) if $stem =~ /\Q$tag\E/;
    }
    return ($section, $sec);
}

# Build the text of one codon usage table.
#   $species     - species name from the record's first line
#   $num         - number of sequences from the record's first line
#   $section     - section title for the SECTION: line
#   $sec         - two letter suffix used in the CUTG ID
#   $counts_line - the record's second line (codon counts)
# Returns the complete record as a single string.
sub _format_record {
    my ($species, $num, $section, $sec, $counts_line) = @_;

    # Index 0 is a dummy so that codon positions are 1-based.  split ' '
    # ignores leading/trailing whitespace and runs of blanks, so a stray
    # space cannot shift every codon by one column.
    my @freq = (0, split ' ', $counts_line);
    push @freq, 0 while @freq < 65;    # short line: missing codons count 0

    my $tot = 0;
    $tot += $_ for @freq;

    # Per amino acid totals; a zero total becomes 1 so that the fraction
    # column prints 0.00 instead of dividing by zero.
    my %aa_total;
    $aa_total{ $_->[0] } += $freq[ $_->[2] ] for @CODON_ROWS;
    for my $aa (keys %aa_total) {
        $aa_total{$aa} = 1 if $aa_total{$aa} == 0;
    }

    # Same guard for the per-thousand column.
    my $per_thousand = $tot / 1000;
    $per_thousand = 1 if $per_thousand == 0;

    # ID: species plus section suffix, blanks to underscores, everything
    # that is not a word character removed.
    my $name = "$species $sec";
    $name =~ s/ /_/g;
    $name =~ s/\W//g;

    my $out = "CUTG ID: $name\n"
            . "SPECIES: $species\n"
            . "SECTION: $section\n"
            . "SEQUENCES: $num\n"
            . "CODONS: $tot\n"
            . "AmAcid Codon Number /1000 Fraction ..\n";

    for my $row (0 .. $#CODON_ROWS) {
        $out .= "\n" if $row % 4 == 0;
        my ($aa, $codon, $idx) = @{ $CODON_ROWS[$row] };
        my $count = $freq[$idx];
        $out .= sprintf "%s %s %11.2f %11.2f %11.2f\n",
            $aa, $codon, $count, $count / $per_thousand, $count / $aa_total{$aa};
    }
    return $out;
}

# Convert every spsum file given as argument and print the resulting
# records to the currently selected output handle (STDOUT by default).
sub main {
    my @files = @_;

    for my $file (@files) {
        my ($section, $sec) = _section_for_file($file);
        warn "getcutg: no known GenBank section in file name '$file'\n"
            if $sec eq '';

        my $fh;
        unless (open $fh, '<', $file) {
            warn "getcutg: cannot open '$file': $!\n";
            next;
        }

        while (defined(my $header = <$fh>)) {
            next unless $header =~ /\S/;       # stray blank line
            my $counts_line = <$fh>;
            last unless defined $counts_line;  # truncated final record

            $header =~ s/\r?\n\z//;
            # The sequence count follows the last ": " on the line, so a
            # species name that itself contains ": " is kept intact.
            my ($species, $num) = $header =~ /^(.*): (.*)$/
                ? ($1, $2)
                : ($header, '');

            print _format_record($species, $num, $section, $sec, $counts_line);
        }
        close $fh;
    }
    return;
}

main(@ARGV);

# true value so the file can also be loaded with require/do
1;
More information about the Info-gcg
mailing list
Send comments to us at biosci-help [At] net.bio.net