> Does anybody know of a program to split up a large DNA-sequence file
> (4 Mb) into smaller files/sequences of 200 kb with 10 kb overlap?
You can use the attached perl script. Calling syntax is
split_seq [-nFragsize -oOverlap] infile
the result is written to StdOut. Default are chunks of 5000 bp
with a 1000 bp overlap
As input formats, FASTA, GCG and EMBL/Swissprot are recognized
Best regards,
Kay
===========================================================
Kay Hofmann, PhD Tel: +49 (221) 950 4814
Bioinformatics Group FAX: +49 (221) 950 4848
MEMOREC Stoffel GmbH
Stoeckheimer Weg 1
D50829 Koeln/Germany E-mail: Kay.Hofmann at memorec.com
-------------- next part --------------
#!/usr/local/bin/perl
#
#
# This program splits a big sequence file into smaller parts
# fragment size and overlap can be given on the command line
#
#
# The call syntax is:
# split_seq [-nFragsize -oOverlap] big_sequence
#
#
#
$pipemode='';
$outformat='FASTA';
$split=5000;
$overlap=1000;
##################################################################
# get sequence in any format and put it into Seq, SeqID and SeqDE
##################################################################
sub getseq {
if ($format eq "FASTA") {
&getfasta;
$Seq = $fastabuffer;
$SeqID = $FastaID;
$SeqDE = $FastaDE;
}
elsif ($format eq "EMBL") {
&getembl;
$Seq = $emblbuffer;
$SeqID = $EmblID;
$SeqDE = $EmblDE;
}
else {
&getgcg;
$Seq = $gcgbuffer;
$SeqID = $gcgid;
$SeqDE = $gcgde;
}
}
###################################################################
# read Pearson/FASTA format sequence (not to be called externally)
###################################################################
sub getfasta {
$fastabuffer="";
$FastaID="";
$FastaDE="";
$line="";
until (($fastaline =~ /^\>/) || eof(SEQFILE)) {$fastaline=<SEQFILE>};
if ($fastaline=~/^\>(\S+)\s(.*)$/) {
$FastaID=$1;
$FastaDE=$2;
}
while ($FastaID=~/^(.*\|)(.+)$/) { $FastaID=$2}
until (($line =~ /^\>/) || eof(SEQFILE)) {
$line=<SEQFILE>;
if (!($line =~ /^\>/)) {$fastabuffer .= $line}
}
if ($line =~ /^\>/) {$fastaline=$line}
else {$fastaline=""};
$fastabuffer =~ tr/a-z/A-Z/;
$fastabuffer =~ s/[^A-Z]//g;
}
###################################################################
# read EMBL/Swissprot format sequence (not to be called externally)
###################################################################
sub getembl {
$emblbuffer="";
$EmblID="";
$EmblDE="";
$line="";
until (($line =~ /^ID\s/) || eof(SEQFILE)) {$line=<SEQFILE>};
if ($line=~/^ID\s+(\w+).*$/) {$EmblID=$1;}
until (($line =~ /^SQ\s/) || eof(SEQFILE)) {
$line=<SEQFILE>;
if ($line =~ /^DE\s+(.*)/) {
if($EmblDE) {$EmblDE.=" "};
$EmblDE .= $1
}
}
if ($line =~ /^SQ\s/) {
until (($line =~ /^\/\//) || eof(SEQFILE)) {
$line=<SEQFILE>;
if (!($line =~ /^\/\//)) {$emblbuffer .= $line}
}
}
$emblbuffer =~ tr/a-z/A-Z/;
$emblbuffer =~ s/[^A-Z]//g;
}
###################################################################
# read GCG format sequence (not to be called externally)
###################################################################
sub getgcg {
$gcgbuffer="";
$gcgid="";
$line="";
until (($line =~ /\.\./) || eof(SEQFILE)) {$line=<SEQFILE>};
if ($line=~/^(\w*).*\.\./) {$gcgid=$1;}
until (eof(SEQFILE)) {
$line=<SEQFILE>;
$gcgbuffer .= $line
}
$gcgbuffer =~ tr/a-z/A-Z/;
$gcgbuffer =~ s/[^A-Z]//g;
}
###################################################################
# test command line arguments and open files:
###################################################################
while ($ARGV[0]=~/^-/) {
$agm = shift(@ARGV);
if($agm=~/n(\d+)/) {
$split=$1
};
if($agm=~/o(\d+)/) {
$overlap=$1
};
}
if($#ARGV!=0) {
print "The call syntax is:\n\n";
print "split_seq [-nFragsize -oOverlap] infile\n\n";
die "\n";
}
unless (-T $ARGV[0]) {die "$ARGV[0] no valid input file\n"};
open(SEQFILE,"$ARGV[0]") || die "can't open $ARGV[1]: $!\n";
###################################################################
# determine sequence file format
###################################################################
$line=<SEQFILE>;
if ($line =~ /^\>/) {$format="FASTA"}
elsif ($line =~ /^ID\s/) {$format="EMBL"}
else {
while (!($line =~ /\.\./) && ($line=<SEQFILE>)) {}
if ($line=~/\.\./) {$format="GCG"}
else {die "Sequence format not recognized\n"}
}
close(SEQFILE);
open(SEQFILE,"$ARGV[0]") || die "can't reopen $ARGV[0]: $!\n";
###################################################################
# the main program
###################################################################
do {
&getseq;
$seqlen = length($Seq);
$count=1;
$pos1=1;
while (length($Seq)>$split) {
$pos2=$pos1+$split-1;
print ">$SeqID","_$count $pos1 to $pos2 $SeqDE\n";
$part = substr($Seq,0,$split);
$Seq = substr($Seq,$split-$overlap);
while (length($part) > 60) {
$line = substr($part,0,60);
$part = substr($part,60);
print "$line\n";
}
print "$part\n";
$count=$count+1;
$pos1=$pos1+$split-$overlap;
}
$part = $Seq;
$pos2=$pos1+length($part)-1;
print ">$SeqID","_$count $pos1 to $pos2 $SeqDE\n";
while (length($part) > 60) {
$line = substr($part,0,60);
$part = substr($part,60);
print "$line\n";
}
print "$part\n";
}
until eof(SEQFILE);