Some time ago I saw a person asking for a program to predict PI of a peptide.
Here is some code which does that.
--
&
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!S!!!!!!!!!!!
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!S!!!!!!!S!!!!!!!!!!!
| [Ca++] | [##|##] <-|
Sergey "The Shmul" Levin | |_ _ V _ |
Albert Einstein College Of Medicine | PFUS->###s_/m\_P_/m\ |
Department of Anatomy and Structural Biology | ###< \_/ \_/ |
1300 Morris Park Avenue, Bronx, New York 10461| #### |
Ulmann building 905 | phosphodiesterase
phone (718) 430 4064 | Parafusin - a member of PGM
e-mail slevin at telico.bioc.aecom.yu.edu | superfamily
=============================================================================
-------------- next part --------------
/***********************************************************************/
/* I just thought that there might be a use for something like this */
/* among protein people. I presonally used it to analyze transmembrane */
/* protein sequences. Programm contents and use is described below */
/* */
/* PROGRAMM: predict_pi <file.seq> */
/* */
/* FILE format: 1 letter amino acid code, all in 1 line */
/* */
/* FUNCTION: predicts overall theoretical pI of a protein sequence */
/* using a really dumb algorythm described below. Also, using this */
/* algorythm, it produces a file with a graph of floating frame of */
/* 10 AA pI values ( pI of AAi to AAi+10), which will be used for */
/* allocation of imporatnt streches in the protein. */
/* */
/* ALGORYTHM: we calculate the summary charge of the peptide by adding */
/* a fractional charge for each AA and recieve a "theoretical */
/* titration curve. Afterwords we solve the root of the function by */
/* bisection. If you can figure out a better way to solve this */
/* function, do not hesitate to chage it. */
/* */
/* COMMENTS: This might also be usefull for aligning distantly-related */
/* protein sequences and possibly, fishing out folds from the PDB. */
/* I will check on that further on. Also, sometimes it is nice to know */
/* the overall theoretical PI of the protein. I have used it to find */
/* stabilizing "post"-membrane loops in a Ca++ chanell sequence. */
/* */
/* USE: Compile (on anything you want: ANSI C). Give name of sequence */
/* file at the command line. */
/* */
/* AUTHORS AND RIGHTS: This programm is a part of a POLINA package */
/* POLINA = Protein Oriented LINear Analysis . The entire package will */
/* be out later (let me debug it and polish some things. Program */
/* written by Sergey Levin, Copyleft 1995 Me at PM :). */
/* I DO NOT!!!! */
/* CARE WHAT YOU USE IT FOR, PUBLISH AND MODIFY ALL YOU WANT :). */
/* just write me a quick e-mail if you find any use for this programm.*/
/* I am just curious. email: slevin at telico.bioc.aecom.yu.edu */
/***********************************************************************/
#include <stdio.h>
#include <math.h>
#define ITERATIONS 3000
#define AMINO_ACIDS 24
/* the 3 letter code Gly Ala Val Leu Ile Met Cys Ser Thr Asn Gln Asp Glu Lys Arg His Phe Tyr Trp Pro del*/
char aminos[AMINO_ACIDS]="gavlimcstnqdekrhfywp- ";
/* the 1 letter code g a v l i m c s t n q d e k r h f y w p NT CT */
int charge_table[AMINO_ACIDS]={0, 0, 0, 0, 0, 0,-1, 0, 0, 0, 0,-1,-1,+1,+1,+1, 0,-1, 0, 0, +1, -1 };
/* the 1 letter code g a v l i m c s t n q d e k r h f y w p NT CT */
float pK_table[AMINO_ACIDS]={0, 0, 0, 0, 0, 0, 8.3, 0, 0, 0, 0, 3.9, 4.1, 10.7, 12.5, 6.0, 0, 10.1, 0, 0, 9.9, 2.9 };
int sequence[10000];
int end;
int amino2int(char);
float pI_peptide(int protein[10000], int n );
void titr_curve();
/*******************************************************/
/*Main file: parcing of command line and function calls*/
/*******************************************************/
void main(int argc, char *argv[])
{
if(argc<2)
{
printf("\nPOLINA> PREDICT_PI: Error. Use: [name]<p> [OUT]\n");
exit(0);
}
read_in(argv[1]);
printf("\n POLINA> PREDICT_PI: Protein %s\n", argv[1]);
printf("\n POLINA> PREDICT_PI: Overall pI=%f\n", pI_peptide(sequence,
end));
pI_frame( 9, "pi.out");
printf("\nPOLINA> PREDICT_PI: Wrote pI results to %c\n",argv[4]);
}
/******************************************************/
/* Read in the protein sequence from a file given on */
/* command line. There should be NO OTHER CHARACTERS */
/* in the sequence file. Modify function as you wish */
/* to accomodate different formats of sequence files. */
/******************************************************/
read_in(char *filename)
{
FILE *f;
char ch;
int i, temp;
if((f=fopen(filename,"r"))==NULL)
{
printf("\nPOLINA> PREDICT_PI: Error opening file: %s\n",filename);
exit(1);
}
while(ch!='\n' && !feof(f))
{
fscanf(f, "%c", &ch);
sequence[i]= amino2int(ch);
i++;
}
end=i-2;
fclose(f);
}
/*****************************************************/
/*Converts aminoacid letter code into numerical value*/
/*using the given table. slevin, 1994 */
/*****************************************************/
int amino2int(char amino)
{
int i;
for(i=0; i<=24; i++)
{
if(amino==aminos[i])
return i;
if(amino=='\n')
return 24;
}
fprintf(stderr,"\nPOLINA> PREDICT_PI: Amino acid '%c'does not exist" , amino);
return 24;
}
/**********************************************************/
/* Floating frame of pI of a protein. */
/**********************************************************/
void pI_frame(int frame, char *filename)
{
FILE *f;
int temp[10];
int i, j, k;
f=fopen(filename,"w");
for(i=0; i+frame<=end; i++)
{
for(j=0; j<=frame; j++)
temp[j]=sequence[i+j];
fprintf( f, "%i , %f\n",i, pI_peptide(temp, frame ));
}
fclose(f);
}
/******************************************************/
/*The basic formula for charge of a protein is derived*/
/*from Henderixon-Hasselbach equation */
/* */
/* pH = pK + log([base]/[acid]) */
/* */
/* and fractional charge Q=1/(1+[base]/[acid]) */
/* */
/* Charge = Summ(Charge of sidechain*1/1+10^q(ph+pK)) */
/* */
/******************************************************/
float protein_charge(float pH, int protein[], int n)
{
float charge=0;
int i;
float base_acid;
for(i=0; i<=n; i++)
{
if(i==0)
{
base_acid=powf(10, (pH - pK_table[23]));
charge += 1 / (1+base_acid);
}
if(i==n)
{
base_acid=powf(10, -(pH - pK_table[24]));
charge += -1 / (1+base_acid);
}
if( charge_table[protein[i]] == +1 )
{
base_acid=powf(10, (pH - pK_table[protein[i]]));
charge += 1 / (1+base_acid);
}
if( charge_table[protein[i]] == -1 )
{
base_acid=powf(10, -(pH - pK_table[protein[i]]));
charge += -1 / (1+base_acid);
}
else charge+=0;
}
return charge;
}
/*************************************************************/
/*This is a bisection method for finding roots to a function */
/*notice that here function is RIGHT AWAY the "protein_charge*/
/*function. Root returned is pI of the given sequence */
/* If you are not good at numerical analysis, I would advise */
/* you not to modify this function */
/*************************************************************/
float pI_peptide( int protein[1000], int n)
{
int i;
float dx, fu, fmid, xmid, root;
float brack1=1.5,brack2=13.5, xacc=0.0001;
fu=protein_charge(brack1, sequence, n);
fmid=protein_charge(brack2, sequence, n);
root = fu < 0.0 ? (dx=brack2-brack1,brack1) :
(dx=brack1-brack2,brack2);
for (i=1; i<=ITERATIONS; i++)
{
fmid = protein_charge(xmid=root+(dx *= 0.5),protein, n);
if (fmid <= 0.0)
root=xmid;
if (fabs(dx) < xacc || fmid == 0.0)
return root;
}
}
/*************************************************************************/
/*This functions outputs titration curve for the protein. Is not used in */
/* main. I used it for my presentations */
/*************************************************************************/
void titr_curve()
{
float i=0, I;
while(i<13.5)
{
protein_charge(i, sequence, end);
i+=0.1;
}
}