#!/usr/bin/perl -w

# paraclu.pl: perform parametric clustering of data attached to sequences

# Written by Martin C Frith 2006
# Genome Exploration Research Group, RIKEN GSC and
# Institute for Molecular Bioscience, University of Queensland

# This program reads in a list of numeric values attached to positions
# in sequences. The list should have four tab- (or space-) separated
# columns containing: the sequence name, the strand, the position, and
# the value. (Multiple values for the same sequence/strand/position
# will be summed.) It outputs the clusters as eight tab-separated
# columns: sequence name, strand, start, end, number of values, sum of
# values, min d, max d. See below for the meaning of "d".

# An example line of input:
# chr1    +       17689   3
# Clustering is performed separately for different strands (as if each
# strand were a completely different sequence).  It does not matter
# whether the position uses 0-based or 1-based coordinates: the
# program does not care, and the output will be consistent with the
# input.

# The clusters are defined as follows. A cluster is a maximal scoring
# segment, where the score of any segment is: the sum of the values in
# the segment minus d times the size of the segment. Large values of d
# give smaller, tighter clusters and small values of d give larger,
# looser clusters. The program finds all possible clusters for any
# value of d, and annotates each cluster with the maximum and minimum
# values of d that produce it. The ratio max d / min d provides a
# measure of the cluster's "stability".

# The output will include two types of obvious/trivial/degenerate
# clusters: those that cover single positions, and those that cover
# all of the positions in a sequence.  For many purposes, it would be
# best to ignore these cases.

use strict;
use List::Util qw(min max);

my %data;

warn "reading...\n";

while (<>) {
    s/#.*//;  # ignore comments
    next unless /\S/;  # skip blank lines

    my ($seq, $strand, $pos, $value) = split;
    my $key = "$seq $strand";
    push @{$data{$key}}, [ $pos, $value ];
}

warn "clustering...\n";

print "# sequence, strand, start, end, sites, sum of values, min d, max d\n";

for my $key (sort keys %data) {  # iterate over sequences / strands
    my ($seq, $strand) = split " ", $key;
    my $sites = $data{$key};

    @$sites = sort { $$a[0] <=> $$b[0] } @$sites;  # sort by position

    my $clusters = all_clusters($sites);

    for my $c (@$clusters) {
	my ($beg, $end, $tot, $sit, $min, $max) = @$c;
	my $beg_pos = $$sites[$beg][0];
	my $end_pos = $$sites[$end][0];
	printf "$seq\t$strand\t$beg_pos\t$end_pos\t$sit\t$tot\t%.3g\t%.3g\n",
	$min, $max;
    }
}

### Generic code to find clusters in a sparse sequence of values: ###

sub all_clusters {
    our $inf = 1e100;  # hopefully much bigger than any value in the input
    our $sites = shift;  # input: reference to array of site locations & values
    our $clusters = [];  # output: reference to array of clusters
    get_clusters(0, $#$sites, -$inf);
    return $clusters;
}

# get clusters of sites between beg and end with density > min_density
sub get_clusters {
    our ($clusters, $inf);
    my ($beg, $end, $min_density) = @_;

    my ($prefix, $pmin, $ptot, $psit) = weakest_prefix($beg, $end);
    my ($suffix, $smin, $stot, $ssit) = weakest_suffix($beg, $end);
    $ptot == $stot and $psit == $ssit or die "internal error!";
    my $max_density = min $pmin, $smin;

    unless ($max_density == $inf) {
	my $break = $pmin < $smin ? $prefix + 1 : $suffix;
	my $new_min = max $min_density, $max_density;
	get_clusters($beg, $break-1, $new_min);
	get_clusters($break, $end, $new_min);
    }

    push @$clusters, [ $beg, $end, $ptot, $psit, $min_density, $max_density ]
	if $max_density > $min_density;
}

# get least dense prefix (and total of values & sites)
sub weakest_prefix {
    our ($sites, $inf);
    my ($beg, $end) = @_;

    my $beg_pos = $$sites[$beg][0];
    my $min_density = $inf;
    my $min_prefix = $end;
    my $tot = 0;
    my $sit = 0;

    for (my $i = $beg; $i < $end; ++$i) {
	$tot += $$sites[$i][1];
	next if $$sites[$i][0] == $$sites[$i+1][0];  # idiot-proofing
	++$sit;
	my $dist = $$sites[$i+1][0] - $beg_pos;
	my $density = $tot / $dist;
	if ($density < $min_density) {
	    $min_prefix = $i;
	    $min_density = $density;
	}
    }

    $tot += $$sites[$end][1];
    ++$sit;
    return ($min_prefix, $min_density, $tot, $sit);
}

# get least dense suffix (and total of values & sites)
sub weakest_suffix {
    our ($sites, $inf);
    my ($beg, $end) = @_;

    my $end_pos = $$sites[$end][0];
    my $min_density = $inf;
    my $min_suffix = $beg;
    my $tot = 0;
    my $sit = 0;

    for (my $i = $end; $i > $beg; --$i) {
	$tot += $$sites[$i][1];
	next if $$sites[$i][0] == $$sites[$i-1][0];  # idiot-proofing
	++$sit;
	my $dist = $end_pos - $$sites[$i-1][0];
	my $density = $tot / $dist;
	if ($density < $min_density) {
	    $min_suffix = $i;
	    $min_density = $density;
	}
    }

    $tot += $$sites[$beg][1];
    ++$sit;
    return ($min_suffix, $min_density, $tot, $sit);
}