################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Sequence Comparison
#
#  copyright (c)
#    Karol Szafranski, 2007
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 2001
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#
# - basic sequence comparison
#   &seqstr_overlap
#
# - matrix construction
#   &MatrixSimrelCalc
#
# - matrix comparison
#   %CompMatrix  (not exported)
#   &CompMatrixAccess
#   &CompMatrixMean
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################

package SeqLab::SeqComp;

# includes
use strict; #use warnings;  # OK 20070926
use MainLib::StrRegexp qw($reEndl);
use MainLib::Data qw(&DataClone &DataPrint);
use MainLib::Misc qw(&MySub);
use Math::kCalc;
use database::DbPlain;
use SeqLab::SeqBench qw(&SeqStrRevcompl);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &seqstr_overlap
  &MatrixSimrelCalc
  &CompMatrixAccess &CompMatrixMean
  );


################################################################################
# basic sequence comparison
################################################################################


sub seqstr_revcompl { return &SeqStrRevcompl(@_) }

# identify maximum overlap between two sequences
# * taken from miRNA/pipeline/fold_compare.pl, not yet in use anywhere in kPerl *
#
# INTERFACE
# - argument 1: sequence string 1
# - argument 2: sequence string 2
#
# - return val: reference to array of
#               - overlap length
#               - overlap orientation
#               - start in sequence 1
#               - start in sequence 2
#
# DESCRIPTION
# - If multiple overlaps have the same maximal length, the returned winner is
#   randomly picked from these.
#
sub seqstr_overlap {
  my $wlen=5;

  # parameters
  my @seq = map{ uc() } @_;
  my @len = map{ length() } @seq;
  my ($ishort,$ilong) = sort{ $len[$a]<=>$len[$b] } 0,1;
  my $seqshrev = &seqstr_revcompl($seq[$ishort]);

  my ($wseq,$pos,@match);
  # k-mer #1, sense-left
  $wseq=substr($seq[$ishort],0,$wlen);
  $pos=0; while (($pos=index($seq[$ilong],$wseq,$pos)) >= 0) {
    my $l = &Min($len[$ishort],$len[$ilong]-$pos);
    if (substr($seq[$ishort],0,$l) eq substr($seq[$ilong],$pos,$l)){
      push @match, [$l,+1,0,$pos];
    }
    ++$pos;
  }
  # k-mer #2, sense-right
  $wseq=substr($seq[$ishort],-$wlen,$wlen);
  $pos=0; while (($pos=index($seq[$ilong],$wseq,$pos)) >= 0) {
    my $l = &Min($len[$ishort],$pos+$wlen);
    if (substr($seq[$ishort],$len[$ishort]-$l,$l) eq substr($seq[$ilong],$pos+$wlen-$l,$l)){
      push @match, [$l,+1,$len[$ishort]-$l,$pos+$wlen-$l];
    }
    ++$pos;
  }
  # k-mer #3, antisense-left
  $wseq=substr($seqshrev,0,$wlen);
  $pos=0; while (($pos=index($seq[$ilong],$wseq,$pos)) >= 0) {
    my $l = &Min($len[$ishort],$len[$ilong]-$pos);
    if (substr($seqshrev,0,$l) eq substr($seq[$ilong],$pos,$l)){
      push @match, [$l,-1,$len[$ishort]-$l,$pos];
    }
    ++$pos;
  }
  # k-mer #4, antisense-right
  $wseq=substr($seqshrev,-$wlen,$wlen);
  $pos=0; while (($pos=index($seq[$ilong],$wseq,$pos)) >= 0) {
    my $l = &Min($len[$ishort],$pos+$wlen);
    if (substr($seqshrev,$len[$ishort]-$l,$l) eq substr($seq[$ilong],$pos+$wlen-$l,$l)){
      push @match, [$l,-1,0,$pos+$wlen-$l];
    }
    ++$pos;
  }

  # determine winning overlap
  my $win=[0,undef,0,0];
  foreach (@match){
    if ($_->[0]>$win->[0]){ $win=$_ }
  }
  # switch coordinate fields if 2nd sequence is the shortest
  if ($ishort){
    my $pos=pop @$win;
    splice @$win,2,0,$pos;
  }
  return $win;
}


################################################################################
# matrix construction
################################################################################


# output amino acid similarity matrix
#
# INTERFACE
# - options:
#   -debug      [STD]
#   -thresh     similarity threshold, default: 0
#   -table      matrix table, default: BLOSUM40
#
# - return val: - reference to matrix table, 
#               - undef if an error occurs
#
# DESCRIPTION
# - we have to copy the matrix data structure in order not to change the
#   matrix data in %CompMatrix
#
sub MatrixSimrelCalc {

  # function constants
  my @SmbSkip = qw(B X Z *);

  # function parameters
  my (%opt) = @_;
  my $debug = $opt{-debug};
  my $TableID = $opt{-table} || 'BLOSUM40';

  # load comparison matrix
  my $pMatrix = &CompMatrixAccess ($TableID, -debug=>$opt{-debug});
  unless ($pMatrix) {
    $debug and printf STDERR "%s. ERROR: no matrix table %d\n", &MySub, $TableID;
    return undef; 
  }
  $pMatrix = &DataClone ($pMatrix);

  # recalculate matrix, loop over 1st-dim symbol
  # - delete non-seq symbols
  # - determine max. similarity
  my ($pLine,$SimMax);
  map { delete $$pMatrix{$_} } @SmbSkip;
  foreach my $ItSmb (sort keys %$pMatrix) {
    $pLine = $$pMatrix{$ItSmb};
    map { delete $$pLine{$_} } @SmbSkip;
    $SimMax = &Max ($SimMax, map { $$pLine{$_} } grep { $_ ne $ItSmb } keys %$pLine);
  }
  $debug and printf STDERR "%s. max. rel. similarity: %.3f\n", &MySub, $SimMax;

  # recalculate matrix for SimMax = 1
  foreach $pLine (values %$pMatrix) {
    map { $$pLine{$_} = ($$pLine{$_}>0) ? $$pLine{$_}/$SimMax : 0 } keys %$pLine;
  }
  if ($debug) {
   printf STDERR "    %s\n", join ("    ", sort keys %$pMatrix);
   foreach my $ItSmb (sort keys %$pMatrix) {
     $pLine = $$pMatrix{$ItSmb};
     printf STDERR "%s %s\n", $ItSmb, join ('',
       map { sprintf "%5.2f", $$pLine{$_}; }
       sort keys %$pLine);
   }
  }

  # exit SUB successfully
  return $pMatrix; 
}


################################################################################
# matrix comparison
################################################################################


# matrix table(s)
our %CompMatrix = ();


# return similarity matrix
#
# INTERFACE
# - argument 1: matrix source file, default search paths:
#               - $ENV{BLASTMAT}/*/
#               - ./
#
# - options:
#   -debug      [STD]
#
# - return val: reference to matrix table
#
# DESCRIPTION
# - currently, only BLAST matrix tables are supported
# - if called the first time, the translation table is loaded from a source
#   file.
#
sub CompMatrixAccess {
  my ($SrcArg,%opt) = @_;
  my $debug = $opt{-debug};

  # matrix table already present?
  unless (exists $CompMatrix{$SrcArg} and %{$CompMatrix{$SrcArg}}) {

    # find source file in search paths
    my (@SrcPossib);
    if ($SrcArg !~ m/^\//) {
      push @SrcPossib, glob ("$ENV{BLASTMAT}/*/$SrcArg")
    }
    push @SrcPossib, $SrcArg;
    my ($SrcFile) = grep { -r $_ } sort @SrcPossib;
    unless ($SrcFile) {
      die sprintf "%s. ERROR: unable to read matrix file %s\n", &MySub, $SrcArg;
    }

    # load matrix table from source file
    $CompMatrix{$SrcArg} = &PlainToTable ($SrcFile,
      -TabType=>'HIH', -delimit=>{'line'=>$reEndl,'col'=>'(\t| +)'},
      -ColShift=>1, -FltComment=>1);
    unless ($CompMatrix{$SrcArg}) {
      die sprintf "%s. ERROR: unable to read matrix file %s\n", &MySub, $SrcFile;
    }

    # refine matrix table structure
    $debug and printf STDERR "%s. unusual columns: %s\n", &MySub,
      map {"'$_' "} grep { $_!~m/^[a-z*-]$/i; } keys %{$CompMatrix{$SrcArg}};
    foreach my $ItSmb (keys %{$CompMatrix{$SrcArg}}) {
      my $pLine = $CompMatrix{$SrcArg}{$ItSmb};
      map { delete $$pLine{$_} } grep { $_ !~ m/^[a-z*]$/i; } keys %$pLine;
    }
  }

  # exit SUB successfully
  return $CompMatrix{$SrcArg}; 
}


# return amino acid translation of nucleotide sequence
#
# INTERFACE
# - argument 1: reference to array of aligned symbols
#
# - options:
#   -debug      [STD]
#   -table      matrix table, default: BLOSUM40
#
# - return val: - mean sequence distance
#               - undef if an error occurred
#
sub CompMatrixMean {
  my ($pSmb,%opt) = @_;
  my $debug = $opt{-debug};
  my $TableID = $opt{-table} || 'BLOSUM40';
  my $pCompMatrix = &CompMatrixAccess ($TableID, -debug=>$opt{-debug});
    # failure causes die

  # loop over symbol pairs
  my @DistSample;
  for (my $i=0; $i<@$pSmb; ++$i) {
    my $cSmbA = $$pSmb[$i];
    for (my $j=$i+1; $j<@$pSmb; ++$j) {
      my $cSmbB = $$pSmb[$j];

      # determine distance between pair of symbols
      push @DistSample, $$pCompMatrix{$cSmbA}{$cSmbB};
    }
  }
  $debug and printf STDERR "%s. mean distance %f for symbol set: %s\n", &MySub,
    &Mean(@DistSample), join(' ',@$pSmb);

  # exit SUB successfully
  return &Mean(@DistSample);
}


1;
# $Id: SeqComp.pm,v 1.13 2018/06/05 18:02:56 szafrans Exp $
