kappe at caos.kun.nl (kappe) writes:
> Does anybody know of a program to split up a large DNA-sequence file
> (4 Mb) into smaller files/sequences of 200 kb with 10 kb overlap?
STADEN has a program called splitseq
or, if you have Tcl 8.x installed, you can use the attached Tcl script:
save it as splitseq.tcl
on a UNIX machine use it as:
splitseq.tcl M.tuberculosis.fas -size 200000 -overlap 10000 > M.tuberculosis_splitted
(the input file has to be in FASTA format)
(the script is also available at Molecular Tcl at
http://evolution.bmc.uu.se/~thomas/tcl
section: Tcl code fragments for sequence analysis )
good luck
-thomas
------ SCHNIPP ------ SCHNAPP ------ SCHNIPP ------ SCHNAPP ------
#!/bin/sh
# Polyglot header: /bin/sh runs this file first and re-executes it under
# tclsh.  Tcl treats the trailing backslash on the comment line below as a
# line continuation, so for Tcl the exec line is still part of that comment.
# "$0" is quoted so that a script path containing whitespace survives the
# re-exec instead of being word-split by the shell.
# The next line is executed by /bin/sh, but not tcl \
exec tclsh "$0" ${1+"$@"}
# Created: Fri Jun 12 08:21:20 1998
# Last changed: Time-stamp: <98/06/12 08:46:34 thomas>
# File: splitseq.tcl
# Usage splitseq.tcl -help
package require opt
# checkopts -- command-line parser built with the opt package.
#
# Accepts one mandatory argument (file) and two optional flags (-size,
# -overlap, defaulting to 1000 and 100).  Every resulting local variable
# is copied into the global array ::data under its own name, so the rest
# of the script reads $data(file), $data(size) and $data(overlap).
::tcl::OptProc checkopts {
    {file          "FASTA file"}
    {-size    1000 "split size in bp"}
    {-overlap 100  "overlap in bp"}
} {
    # [info locals] is evaluated before the loop variable is created, so
    # the loop variable itself is never part of the list being copied.
    foreach optName [info locals] {
        # "args" / "Args" are not user options (presumably bookkeeping
        # variables of the opt machinery -- confirm); leave them out.
        if {[lsearch -exact {args Args} $optName] < 0} {
            set ::data($optName) [set $optName]
        }
    }
}
# ----------------------------------------------------------------------
# Main script
#
# Reads one FASTA record from the file named on the command line and
# writes it to stdout as a series of overlapping pieces.  Each piece is
# preceded by a header "<name>:<first>:<last>" (1-based, inclusive
# coordinates in the original sequence) and is wrapped at 60 characters.
# Progress and error messages go to stderr; every failure exits with a
# non-zero status so callers and shell pipelines can detect it.
# ----------------------------------------------------------------------

# Parse the command line; checkopts leaves the values in the global array
# "data".  NOTE(review): the opt package appears to report -help through
# this same error path, so a help request also exits non-zero -- confirm.
if {[catch {eval checkopts $argv} msg]} {
    puts stderr $msg
    exit 1
}

# Validate the numeric options before touching the file.  A non-positive
# size or a negative overlap would produce empty or gapped pieces, and
# overlap >= size would make the step between pieces zero or negative.
if {$data(size) < 1} {
    puts stderr "wrong size value"
    exit 1
}
if {$data(overlap) < 0 || $data(overlap) >= $data(size)} {
    puts stderr "wrong overlap value"
    exit 1
}

# Open the input, turning an open failure into a short message instead
# of a Tcl stack trace.
if {[catch {open $data(file) r} fid]} {
    puts stderr "cannot open $data(file): $fid"
    exit 1
}

# The first line must be a FASTA header; "name" keeps the leading ">" so
# it can be reused directly as the header prefix of every output piece.
gets $fid line
if {[regexp "^>\[a-zA-Z0-9_-]+" $line name] == 0} {
    close $fid
    puts stderr "no valid FASTA file"
    exit 1
}

# Slurp the remainder of the file and keep only sequence characters
# (letters, "*" and "-"); this also removes the line breaks.
# NOTE(review): only the first header is interpreted.  Any further ">"
# header lines would have their letters folded into the sequence, so the
# input is assumed to contain a single record.
puts -nonewline stderr "reading sequence from $data(file)"
set seq [read $fid]
close $fid
regsub -all {[^a-zA-Z*-]} $seq {} seq

set length [string length $seq]
puts stderr "\n\n\nlength: $length"

# Distance between the start positions of two consecutive pieces.
set diff [expr {$data(size) - $data(overlap)}]
puts stderr "splitting in $data(size) with $data(overlap) overlap"

# Walk over the sequence in steps of $diff.  "to" is clamped to the last
# valid index so the header never reports coordinates past the end, and
# the loop stops as soon as a piece reaches the end so that no trailing
# piece consisting solely of already-printed overlap is emitted.
set last [expr {$length - 1}]
for {set from 0} {$from < $length} {incr from $diff} {
    set to [expr {$from + $data(size) - 1}]
    if {$to > $last} {
        set to $last
    }
    set subseq [string range $seq $from $to]
    puts "$name:[expr {$from + 1}]:[expr {$to + 1}]"

    # Insert a newline after every 60 characters.  When the piece length
    # is an exact multiple of 60 this leaves a trailing newline, which is
    # trimmed so that puts does not produce an empty line.
    regsub -all "............................................................" \
        $subseq "&\n" subseq
    puts [string trimright $subseq "\n"]

    if {$to == $last} {
        break
    }
}
------ SCHNIPP ------ SCHNAPP ------ SCHNIPP ------ SCHNAPP ------
--
Sicheritz Ponten Thomas E. Department of Molecular Biology
Biomedical Center Uppsala University
BMC: +46 18 4714214 BOX 590 S-751 24 UPPSALA Sweden
Fax +46 18 557723 http://evolution.bmc.uu.se/~thomas
Molecular Tcl: http://evolution.bmc.uu.se/~thomas/tcl
Molecular Linux: http://evolution.bmc.uu.se/~thomas/mol_linux
De Chelonian Mobile ... The Turtle Moves ...