#!/usr/bin/perl

# Command-line handling.
#
# Three arguments are mandatory. A fourth, optional argument ("fast") is
# read below and, when true, makes the script close every closable gap
# without asking for confirmation on STDIN.
if(@ARGV < 3) {

    # A usage error is reported on STDERR with a non-zero exit status so
    # that calling scripts and pipelines can detect the failure.
    print STDERR ("\nUSAGE: parse_links.pl <scaff> <graph> <outdir> [fast]\n\n".
           "scaff\t= File containing scaffold information in .scaff format\n".
           "graph\t= File containing contig graph information similar to Newbler output\n".
           "outdir\t= Directory to which the files \"result.scaff\" and \"log\" will be written\n".
           "fast\t= Optional; when given a true value (e.g. 1) every closable gap is closed without prompting\n\n");

    exit 1;
}

# These are deliberately package globals (no "my"): the rest of the script
# refers to them by name.
($scaff, $graph, $outdir, $fast) = @ARGV;

# flip($text)
#
# Swap the contig-end markers in a string such as "12 5'": the first 5'
# becomes 3' and the first 3' becomes 5'. A string containing neither marker
# is returned unchanged.
#
# Returns the modified copy; the argument itself is not altered.
sub flip {

    # A lexical copy: the original used local(), which creates a dynamically
    # scoped package variable that every sub called from here could see.
    my ($text) = @_;

    # Three-step swap through the temporary marker 6' so that a 5' which has
    # just been produced from a 3' is not immediately turned back.
    $text =~ s/5'/6'/;
    $text =~ s/3'/5'/;
    $text =~ s/6'/3'/;

    return $text;
}

# %memory caches successful searches, keyed by "<ctg1> <side1> <ctg2> <side2>";
# explore_path consults it before searching and fills it after a success.
# $max_search_depth is the recursion depth beyond which a search gives up.
%memory = (); $max_search_depth = 15;

# explore_path($ctg1, $side1, $ctg2, $side2, *links, $depth, *contigs, %seen)
#
# Recursive search through the link graph for a chain of contigs leading
# from end $side1 of contig $ctg1 to end $side2 of contig $ctg2. Among the
# candidate chains it examines, the one with the fewest elements is kept.
#
# Arguments:
#   $ctg1, $side1  start contig id and end; "$ctg1 $side1" is the key
#                  looked up in %links
#   $ctg2, $side2  target contig id and end
#   *links         typeglob whose hash maps "id side" => { "id side" => value }
#   $depth         current recursion depth (callers start at 0)
#   *contigs       typeglob whose hash is keyed by names of the form
#                  "contig%05d"; contigs with a true entry are never used as
#                  intermediate steps
#   %seen          ids already on the chain being built (id => 1)
#
# Returns a list of "id side" strings for the intermediate contigs, in order
# from $ctg1 towards $ctg2; the empty list when the two ends are linked
# directly; or the single element "not found" when no chain was found.
#
# NOTE(review): the %memory key contains neither %seen nor $depth, so a
# cached path may have been found under a different visited set or depth
# than the current call - confirm this is acceptable.
sub explore_path {

    # local() on a typeglob (*links, *contigs) aliases %links and %contigs
    # to the caller's hashes for the duration of the call. %seen absorbs all
    # remaining arguments.
    local($ctg1, $side1, $ctg2, $side2, *links, $depth, *contigs, %seen) = @_;
    # Default result: the one-element list ("not found").
    local(@path) = ("not found");

    # Give up when too deep, and reuse a previously found path if there is one.
    return @path if($depth > $max_search_depth);
    return @{$memory{"$ctg1 $side1 $ctg2 $side2"}} if(exists $memory{"$ctg1 $side1 $ctg2 $side2"});
 
    # Try every contig end linked to the current end.
    foreach $next (keys(%{$links{"$ctg1 $side1"}})) {

	# $id and $side are package globals and are overwritten by the
	# recursive call below; nothing after that call reads them.
	($id, $side) = split(/ /, $next);

	if(!$seen{$id}) {

	    # Direct link to the target end: the empty path cannot be beaten,
	    # so stop looking.
	    if($id eq $ctg2 && $side eq $side2) {

		@path = ();
		last;
	    }

	    # Contigs flagged in %contigs may not serve as intermediates.
	    next if($contigs{sprintf("contig%05d", $id)});
	    # Continue from the opposite end of the neighbour, one level deeper,
	    # with the neighbour added to the visited set.
	    local(@subpath) = &explore_path($id, &flip($side), $ctg2, $side2, *links, $depth+1, *contigs, ($id => 1, %seen));	

	    # Keep the candidate if it is the first success or strictly shorter
	    # than the best so far (@subpath+1 is its length once $next is
	    # prepended).
	    if($subpath[0] ne "not found" && ($path[0] eq "not found" || @path > @subpath+1)) {    

		unshift @subpath, $next;
		@path = @subpath;
	    } 
	}
    }

    # Only successes are cached; failures are not stored.
    @{$memory{"$ctg1 $side1 $ctg2 $side2"}} = @path if($path[0] ne "not found");

    return @path;
}

# path2scaff($ctg1, $side1, $ctg2, $side2, *links, *path, *length, *contigs)
#
# Convert a path (as returned by explore_path) between end $side1 of contig
# $ctg1 and end $side2 of contig $ctg2 into two strings.
#
# Arguments:
#   $ctg1, $side1  start contig id and end
#   $ctg2, $side2  target contig id and end
#   *links         typeglob; its hash maps "id side" => { "id side" => count }
#   *path          typeglob; its array holds the "id side" path elements
#   *length        typeglob; its hash maps "contig%05d" names to lengths
#   *contigs       typeglob; its hash flags "contig%05d" names (a flagged
#                  contig is marked with "**" in the evidence string)
#
# Returns ($scaff, $evidence):
#   $scaff     one "contigNNNNN <BE|EB> <length> 0\n" line per path element
#              (undef when the path is empty, i.e. the ends are linked
#              directly)
#   $evidence  one-line description of the links along the path, ending with
#              the link into the target end
#
# Unlike the original implementation this does not modify the caller's @path
# and does not overwrite the package globals $id and $side.
sub path2scaff {

    my ($ctg1, $side1, $ctg2, $side2) = @_[0 .. 3];

    # The remaining arguments are typeglobs; assigning them to localised
    # globs aliases %links, @path, %length and %contigs to the caller's
    # variables for the duration of the call.
    local(*links, *path, *length, *contigs) = @_[4 .. 7];

    my ($scaff, $evidence);

    # Node list used for link look-ups. Element 0 is the flipped start end,
    # so that flip($nodes[0]) below yields the real start key "$ctg1 $side1".
    my @nodes = (flip("$ctg1 $side1"), @path);

    for my $i (1 .. $#nodes) {

        my ($id, $side) = split(/ /, $nodes[$i]);
        my $name = sprintf("contig%05d", $id);

        # A contig entered through its 5' end is written as BE, otherwise EB.
        $scaff .= sprintf("%s %s %d 0\n", $name, ($side eq "5'" ? "BE" : "EB"), $length{$name});

        # The link leaves the previous node through the end opposite to the
        # one by which that node was entered, hence the flip.
        $evidence .= sprintf("<%d links> %s,Size=%d%s ", $links{flip($nodes[$i-1])}{$nodes[$i]}, $name, $length{$name},
            ($contigs{$name} ? "**" : ""));
    }

    # Final link from the last node into the target end.
    $evidence .= sprintf("<%d links>", $links{flip($nodes[$#nodes])}{"$ctg2 $side2"});
    return ($scaff, $evidence);
}

print "Step 1: Reading the graph in.\n";

# Step 1 fills these package globals, which the rest of the script reads:
#   %id_map       index => contig id
#   %coverage     contig id => coverage
#   %length       contig id => length
#   %copy_number  contig id => coverage / average coverage, rounded
#   %links        "id side" => { "id side" => read count }, stored in both
#                 directions
# They must stay package variables because %links and %length are handed to
# the subs as typeglobs.

# Three-argument open, and fail loudly: an unreadable graph would otherwise
# silently produce an empty link table.
open(my $graph_fh, '<', $graph) or die "Cannot read graph file $graph: $!\n";

$links = 0; $total_coverage = 0; $count = 0; %links = ();
while(my $line = <$graph_fh>) {

    my @data = split(/\s+/, $line);

    # Blank lines carry no information and would create empty-keyed entries.
    next unless(@data);

    if($data[0] ne 'C') {

        # Every line not starting with "C" is read as: index id length coverage.
        my ($index, $id, $length, $coverage) = @data;
        $id_map{$index} = $id; $coverage{$id} = $coverage; $length{$id} = $length;

        # Only contigs longer than 1000 contribute to the average coverage.
        ($total_coverage, $count) = ($total_coverage+$coverage, $count+1) if($length > 1000);
        next;
    }

    # First "C" line: everything read so far is taken as the complete contig
    # list, so the copy numbers are derived now.
    if($links == 0) {

        if($count > 0 && $total_coverage > 0) {

            $avg_coverage = $total_coverage/$count;
            foreach my $name (keys(%coverage)) {

                $copy_number{$name} = int($coverage{$name}/$avg_coverage + 0.5);
            }
        }
        else {

            # Without a usable average the division would be fatal. Copy
            # numbers only feed the repeat report, so carry on without them.
            warn "No contig longer than 1000 with non-zero coverage precedes the links; copy numbers are unavailable.\n";
        }
    }

    # "C" line: C id1 side1 id2 side2 reads.
    my (undef, $id1, $side1, $id2, $side2, $reads) = @data;

    $links++;
    $links{"$id1 $side1"}{"$id2 $side2"} = $reads;
    $links{"$id2 $side2"}{"$id1 $side1"} = $reads;
}
close($graph_fh);

# Step 2 walks through consecutive contig pairs of the scaffold, asks
# explore_path for a chain of graph contigs bridging each gap and, when the
# gap is closed, splices that chain into the scaffold text.
#
# Results are left in the package globals $scaff_text (the new scaffold body,
# one line per contig) and $closed (number of gaps closed).

# The log is opened before any work is done so that an unwritable output
# directory is reported up front rather than after an interactive session.
# NOTE(review): nothing visible in this file ever prints to LOG.
open(LOG, '>', "$outdir/log") or die "Cannot write $outdir/log: $!\n";

print "Step 2: Comparing the graph to the scaffold.\n";

# Read the scaffold once, in Perl. The original additionally ran
# `cat $scaff` through the shell, which breaks on file names containing
# spaces or shell metacharacters.
open(my $scaff_fh, '<', $scaff) or die "Cannot read scaffold file $scaff: $!\n";
my @scaff_lines = <$scaff_fh>;
close($scaff_fh);

# Names of all contigs already placed in the scaffold. The orientation group
# is non-capturing so that only contig names become keys (a capturing group
# also inserted the keys "BE" and "EB").
%contigs = ();
foreach my $scaff_line (@scaff_lines) {

    $contigs{$_} = 1 foreach ($scaff_line =~ m/(\S+) (?:BE|EB)/g);
}

# The first line is the header; the second is the first contig.
# NOTE(review): every later line is treated as a contig line, i.e. a single
# scaffold per file is assumed - confirm.
$header = shift @scaff_lines;
$last   = shift @scaff_lines;
$closed = 0;
$scaff_text = '';

foreach my $current (@scaff_lines) {

    # Scaffold line layout: id orientation size offset [rest...]. The offset
    # of contig 1 is reported below as the size of the gap to contig 2.
    my ($id1, $or1, $size1, $offset1, @rest1) = split(/\s+/, $last);
    my ($id2, $or2) = split(/\s+/, $current);

    # Ends of the two contigs that face the gap.
    my $side1 = ($or1 eq "BE" ? "3'" : "5'");
    my $side2 = ($or2 eq "BE" ? "5'" : "3'");

    # Numeric contig id without leading zeros. "0*" (not "0+") also accepts
    # ids without a leading zero such as contig10000, and the list assignment
    # yields undef on failure instead of reusing a stale $1.
    my ($ctg1) = $id1 =~ m/contig0*(\d+)/;
    my ($ctg2) = $id2 =~ m/contig0*(\d+)/;

    # @path stays a package variable because it is passed on as a typeglob.
    @path = (defined $ctg1 && defined $ctg2)
        ? explore_path($ctg1, $side1, $ctg2, $side2, *links, 0, *contigs, ())
        : ("not found");

    # Lines for the contigs spliced into this gap (empty unless closed).
    my $fill = '';

    # An empty @path also counts as found: the two ends are linked directly.
    if($path[0] ne "not found") {

        my ($path_scaff, $evidence) = path2scaff($ctg1, $side1, $ctg2, $side2, *links, *path, *length, *contigs);

        my $close_gap = $fast;
        if(!$fast) {

            print "Gap between $id1 & $id2 can be closed (Gap Size = $offset1).\nContig Graph Path: $evidence\n";
            print "Close Gap? (Y/N): ";

            # Accept "Y"/"y" at the start of the answer. End of input
            # counts as "no".
            my $answer = <STDIN>;
            $close_gap = (defined $answer && $answer =~ m/^\s*y/i);
        }

        if($close_gap) {

            $offset1 = 0;
            $fill = $path_scaff if(defined $path_scaff);
            $closed++;
        }
    }

    # Contig 1 is always written, whether or not its gap was closed (the
    # original dropped it when the user declined). The bridging contigs
    # follow it, because they lie between contig 1 and contig 2.
    $scaff_text .= "$id1 $or1 $size1 $offset1 @rest1\n" . $fill;

    $last = $current;
}

# Write the final contig. It is parsed from $last rather than taken from the
# loop variables so that a scaffold with a single contig is not lost.
if(defined $last) {

    my ($id, $or, $size, $offset, @rest) = split(/\s+/, $last);
    $scaff_text .= "$id $or $size $offset @rest\n";
}

print "Step 3: Identifying contigs placed in the scaffold that are likely to be repeats.\n";

# Report every scaffold contig longer than 1000 whose estimated copy number
# exceeds 1, together with the number of times it occurs in the new scaffold.
foreach my $contig (keys(%contigs)) {

    # Logical && is intended here; the original used the bitwise & operator.
    next unless($length{$contig} > 1000 && $copy_number{$contig} > 1);

    # Count lines of the scaffold text that start with this contig. "^"
    # with /m also matches the very first line, which the original "\n"
    # prefix could never do. \Q...\E keeps the name from being read as a
    # regular expression.
    my @scaff_matches = ($scaff_text =~ m/^\Q$contig\E (?:BE|EB)/mg);

    # Data goes in as arguments, never into the format string itself.
    printf "%s (in-scaffold = %d, copy-number = %s)\n", $contig, scalar(@scaff_matches), $copy_number{$contig};
}

# Header figures: number of contig lines, summed sizes and summed offsets.
$count = 0; $total = 0; $gaps = 0;
foreach my $entry (split(/\n/, $scaff_text)) {

    my (undef, undef, $size, $offset) = split(/\s+/, $entry);
    $count++;
    $total += $size;
    $gaps  += $offset;
}

print "Successfully closed $closed gaps!\n";

# Losing the result silently is the worst possible outcome, so both the open
# and the close of the output file are checked.
open(my $result_fh, '>', "$outdir/result.scaff") or die "Cannot write $outdir/result.scaff: $!\n";
printf {$result_fh} ">all_seq %s %s %d\n", $count, $total, ($total+$gaps);
print {$result_fh} $scaff_text;
close($result_fh) or die "Error while writing $outdir/result.scaff: $!\n";
close(LOG);
