IUBio

How to view overlapping sequences

Richard Durbin rd at sanger.ac.uk
Tue Nov 9 11:59:36 EST 1999


Here is the perl script we use to make the LINK* objects.  A bit
C.elegans specific I am afraid.

It assumes that there is a chain of Overlap_right specifiying the
order of sequences, with a value to the right if and only if there is
an actual overlap, the value being the start position of the
right-overlapping sequence within the current sequence.

Richard

---------------------------------------------------------------------------

#!/usr/local/bin/perl

##########################################################
#
# Makelinks program which will link up cosmid sequences and 
# spanning subsequences based on Overlap_left/right info and
# the current LINK_*.  Makes one LINK_XXX for each set of
# entries with DNA and Overlap_right set non-zero and
# no Confidential_remark "not in Cambridge LINK".  XXX is 
# the leftmost entry name.  SUPERLINK_CB_* are also made 
# based on a hardwired table of left ends.
#
# Performs a number of checks to make sure of the integrity 
# of the new links
#
# 1) Will check that all sequences that previously were in a link
#    are put back into a link.
# 2) Will check that all subsequences from links are put back into 
#    links.
#
#    WARNING: Current implementation relies on the fact that 
#    all current primary links have names starting with "LINK".
#
# RD 990712 total rewrite of Steven Jones original using AcePerl
# now handle reversed sequences with Flipped tag set.
# -dump and -load options to separate reading database from making result.
# -flip_subseqs option a bit subtle - when attaching spanning subsequences
#    based on flipped sequences, reverse attachment info.  This is correct
#    only once, when sequences are newly flipped (but how to know that in
#    the future?).
#
##########################################################

use Getopt::Long ;

&GetOptions("dump!","load=s", "flip_subseqs!") ;

if ($opt_load) {
    open (FILE, "$opt_load") || die "failed to open dump file $opt_load\n" ;
    while (<FILE>) {
	chop ;
	($type, $seq, @a) = split /\t/ ;
	if ($type eq "SEQ") {
	    ($length{$seq}, $right{$seq}, $rightOffset{$seq}, $left{$seq},
	     $isFlipped{$seq}, $isExternal{$seq}, $isLinkCandidate{$seq}) = @a ;
	    $isGenomeSequence{$seq} = 1 ;
	} elsif ($type == "LINK") {
	    ($currSource{$seq}, $currStart{$seq}, $currEnd{$seq}) = @a ;
	} else {
	    die "bad line in load file: $type\t$seq\t$_\n" ;
	}
    }
} else {

	# get data from camace

    use Ace ;
    $campath = glob("~/acedb/ace4/cam") ;
    $camdb = Ace->connect(-path=>"$campath") || 
	die "failed to connect to database\n" ;

	# first get lots of info about Genome_sequence objects

    $it = $camdb->fetch_many(Genome_sequence => '*') ;
    while ($obj = $it->next) {
	$isGenomeSequence{$obj} = 1 ;
	$length{$obj} = $obj->DNA(2) ;
	if (!$length{$obj}) { warn "No length for $obj\n" ; }
	$right{$obj} = $obj->Overlap_right ;
	$rightOffset{$obj} = $obj->Overlap_right(2) ;
	$left{$obj} = $obj->Overlap_left ;
	if ($obj->at('Structure.Flipped')) { $isFlipped{$obj} = 1 ; }
	foreach ($obj->Confidential_remark) {
	    if (/^not in Cambridge LINK$/) { $isExternal{$obj} = 1 ; }
	}
	$isLinkCandidate{$obj} = (!$isExternal{$obj} && $length{$obj} > 0) ;
    }

	# then some info about current links

    $it = $camdb->fetch_many(Sequence => 'LINK*') ;
    while ($obj = $it->next) {
	foreach $a ($obj->at('Structure.Subsequence')) {
	    ($seq, $start, $end) = $a->row ;
	    $currSource{$seq} = $obj ;
	    $currStart{$seq} = $start ;
	    $currEnd{$seq} = $end ;
	}
    }
}

if ($opt_dump) {
    foreach $obj (keys %length) {
	print "SEQ\t$obj\t$length{$obj}\t$right{$obj}\t$rightOffset{$obj}\t$left{$obj}\t" ;
	print "$isFlipped{$obj}\t$isExternal{$obj}\t$isLinkCandidate{$obj}\n" ;
    }
    foreach $seq (keys %currSource) {
	print "LINK\t$seq\t$currSource{$seq}\t$currStart{$seq}\t$currEnd{$seq}\n" ;
    }

    exit 0 ;		# exit after dumping
}

###########################################
# make links
###########################################

foreach $seq (keys %isGenomeSequence) {
				# only keep seeds
    next if (!$isLinkCandidate{$seq} ||
	     ($left{$seq} && $isLinkCandidate{$left{$seq}} && $rightOffset{$left{$seq}}) ||
	     !$rightOffset{$seq} ||
	     !$isLinkCandidate{$right{$seq}}) ;

				# print LINK header
    $lk = "LINK_$seq" ;
    print "\nSequence $lk\n" ;
    print "From_laboratory CB\n" ;

				# loop over subsequences
    $startright=1 ;
    while ($isLinkCandidate{$seq}) {
	if ($isFlipped{$seq}) {
	    $start{$seq} = $startright + $length{$seq} - 1 ; 
	    $end{$seq} = $startright ; 
	} else {
	    $start{$seq} = $startright ; 
	    $end{$seq} = $startright + $length{$seq} - 1 ;
	}
	print "Subsequence $seq $start{$seq} $end{$seq}\n" ;
	$link{$seq} = $lk ;
	if (!$rightOffset{$seq}) { last ; } # POSS EXIT FROM LOOP
	$startright = $startright + $rightOffset{$seq} - 1 ;
	$seq = $right{$seq} ;
    }		
}

###########################################
#Re-map subsequences back onto the new links 
###########################################

foreach $seq (keys %currStart) {
    next if ($isGenomeSequence{$seq}) ;
    if ($seq =~ /(\S+)\.\d+/) { $parent = $1 ; }
    else { warn "no dot in subsequence name $seq\n" ; next ; }
    if (!$currStart{$parent}) { warn "no coord in link for parent $parent of $seq\n" ; next ; }
    if (!($currSource{$seq} eq $currSource{$parent})) { warn "parent $parent and child $seq in different links\n" ; next ; }
    if (!$link{$parent}) { warn "no new link for parent $parent of $seq\n" ; next ; }
    if ($isFlipped{$parent} && $opt_flip_subseqs)  {
	$start{$seq} = $start{$parent} + $currStart{$parent} - $currStart{$seq} ;
	$end{$seq} =  $start{$parent} + $currStart{$parent} - $currEnd{$seq} ;
    } else {
	$start{$seq} = $start{$parent} - $currStart{$parent} + $currStart{$seq} ;
	$end{$seq} = $start{$parent} - $currStart{$parent} + $currEnd{$seq} ;
    }
    print "\nSequence $link{$parent}\n";
    print "Subsequence $seq $start{$seq} $end{$seq}\n" ;
    $link{$seq} = $link{$parent} ;
}

###########################################
#Do a quick check to make sure that everything 
#that was in a link has been put back in one
###########################################

foreach $seq (keys %currSource) {
    if (!$link{$seq}) { warn "$seq not put back into a link\n" ; }
}

###########################################
# make superlinks
###########################################

%superinfo = ( "I" => "H06O01",		# left ends of superlinks
	       "IR" => "Y95D11A",
	       "II" => "T05A6",
	       "IIIL" => "F45H7",
	       "IIIR" => "K01F9",
	       "IV" => "F21D5",
	       "V" => "F07D3",
	       "X" => "B0272"
	       ) ;

foreach $chrom (keys %superinfo) {
    print "\nSequence SUPERLINK_CB_$chrom\n" ;
    print "From_laboratory CB\n" ;
    $pos = 1 ;
    $seq = $superinfo{$chrom} ;
    while ($seq) {
	if ($link{$seq}) {
	    $lk = $link{$seq} ;
	    while ($seq && $lk eq $link{$seq}) {
		$end = $end{$seq} ;
		$seq = $right{$seq} ;
	    }
	    $end += $pos - 1 ;
	    print "Subsequence $lk $pos $end\n" ;
	} elsif ($isLinkCandidate{$seq}) {
	    $end = $pos - 1 + $length{$seq} ;
	    if ($isFlipped{$seq}) {
		print "Subsequence $seq $end $pos\n" ;
	    } else {
		print "Subsequence $seq $pos $end\n" ;
	    }
	    $seq = $right{$seq} ;
	} elsif (!$isExternal{$seq}) {
	    $end = $pos - 1 + 1000 ;
	    print "Subsequence $seq $pos $end\n" ;
	    $seq = $right{$seq} ;
	} else {
	    last ;
	}
	$pos = $end + 101 ;
    }
}

############# end of file ################
---





More information about the Acedb mailing list

Send comments to us at biosci-help [At] net.bio.net