#!/usr/bin/perl -w

# make_psmm.pl: make a position-specific Markov model (count k-mers)

# Written by Martin C Frith 2006
# Genome Exploration Research Group, RIKEN GSC and
# Institute for Molecular Bioscience, University of Queensland

# This program takes two parameters: a number indicating the "order"
# of the Markov model to make, and a list of sequences in multi-fasta
# format. The sequences should all have the same length or it will
# complain. The program simply counts how often each k-mer occurs at
# each position, where k = order+1. (This is standard Markov model
# notation, don't blame me.) The results are printed as a matrix, with
# the k-mer in the leftmost column, and the counts in each position in
# the following columns.

use strict;
use File::Basename;

die "Usage: ", basename($0), " MM-order sequences.fa\n" unless @ARGV;
my $order = shift;
my $word_size = $order + 1;

my %freq;  # store the k-mer counts here
my $windows;  # the number of k-mers that fit into a sequence

my $seq = '';
while (<>) {
    if (/^>/) {
	do_it();
	$seq = '';
    } else {
	$seq .= $_;
    }
}
do_it();

for my $kmer (sort keys %freq) {
    print $kmer;
    my $f = $freq{$kmer};
    for (my $i = 0; $i < $windows; ++$i) {
	$$f[$i] = 0 unless defined $$f[$i];
	print "\t", $$f[$i];
    }
    print "\n";
}

sub do_it {
    $seq =~ tr/a-zA-Z//cd;  # remove non-alphabetic characters (e.g. newlines)
    return unless $seq;
    $seq = lc $seq;  # convert the sequence to lower-case

    my $w = length($seq) - $word_size + 1;
    die "unequal sequence lengths" if defined $windows and $w != $windows;
    $windows = $w;

    for (my $i = 0; $i < $w; ++$i) {
	my $kmer = substr $seq, $i, $word_size;
	# keep k-mers containing ambiguous letters, so column sums are constant
	++$freq{$kmer}[$i];
    }
}
