################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Sequence Comparison
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2001
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %_LibParam  (not exported)
#
# - sequence comparison - find overlaps
#   &PrimaryOverlap  (not exported)
#   &EndOverlap  (not exported)
#   &SeqStretchRepair
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################

package SeqLab::SeqComp;

# includes
use strict; #use warnings;  # OK 2003xxxx
use MainLib::Misc qw(&MySub);
use Math::kCalc qw(&Min &Max);
use SeqLab::SeqBench qw(&SeqStrPure &SeqStrRevcompl);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &SeqStretchRepair
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# sequence comparison
################################################################################


# word length in comparisons
$_LibParam{WordLen} = 10;


# report primary end overlaps in a pair of sequences
#
# INTERFACE
# - argument 1: reference to sequence A
# - argument 2: reference to sequence B
#
# - options:
#   -debug      print debug protocol to STDOUT
#   -length     minimum length of overlaps to search for (default:
#               $_LibParam{WordLen})
#
# - return val: - 1: primary identity found
#               - undef: no primary identity
#
sub PrimaryOverlap {
  my ($pSeqA,$pSeqB,%opt) = @_;
  my $WordLen = (int $opt{-length}) || $_LibParam{WordLen};

  # ensure SeqA < SeqB
  my $SeqALen = length $$pSeqA;
  my $SeqBLen = length $$pSeqB;
  if ($SeqBLen < $SeqALen) {
    ($pSeqA, $pSeqB)   = ($pSeqB, $pSeqA);
    ($SeqALen, $SeqBLen) = ($SeqBLen, $SeqALen);
  }
  if ($SeqALen < $WordLen) { return undef }

  # make search strings from ends of SeqA (shorter of both sequences)
  my ($StretchA,$StretchB);
  my @sSearch =
    ($StretchA = substr ($$pSeqA, 0, $WordLen)),
    &SeqStrRevcompl ($StretchA),
    ($StretchB = substr ($$pSeqA, -$WordLen, $WordLen)),
    &SeqStrRevcompl ($StretchB);

  # do primer search
  foreach (@sSearch) {
    if (index ($$pSeqB, $_) + 1) {
      return 1;
    }
  }

  # no success
  return undef;
}


# search for end overlaps in a pair of sequences
#
# INTERFACE
# - argument 1: reference to sequence A
# - argument 2: reference to sequence B
#
# - options:
#   -debug      print debug protocol to STDOUT
#   -length     minimum length of overlaps to search for (default: package
#               $_LibParam{WordLen})
#   -overlap    search only for true overlap and identity cases
#
# - return val: reference to result structure (description above)
#
# DESCRIPTION
# - result structure:
#   array of hashes, each hash representing an overlap case
#    @Result->%Entry
#   hash keys for overlap entry:
#    case     temporary information about the search case (one of four) which
#             caused the primary hit
#    length   length of overlap
#    offa     offset of overlap region in sequence a, positive strand
#             value in biological position (starting at 1)
#    offb     offset of overlap region in sequence b, positive strand
#             value in biological position (starting at 1)
#    strand   relative strandedness in overlap region:
#             F  forward / forward
#             R  forward / reverse-complement
#    type     classification type of overlap case:
#             complete identity       SeqA is equal to SeqB
#             contemplating identity  SeqA contains SeqB
#             included identity       SeqA is included in SeqB
#             overlap                 SeqA is overlapping with SeqB
#             divergent overlap       SeqA is divergently overlapping with SeqB
# - all overlap entries are ordered by length.
# - note that not nearly all possible divergent overlap cases are recovered.
#   A case is recovered only if there's an end stretch of the shorter sequence
#   being at least $WordLen bp identical to the longer sequence
# - If the identity range is palindromic in nature it's reported only once
#   though.
#
# DEBUG, CHANGES, ADDITIONS
# - internal position counters should be counted in computational system, not
#   in biological system, shouldn't they?
#
sub EndOverlap {
  my ($pSeqA, $pSeqB, %opt) = @_;
  my $debug = $opt{-debug};
  my $WordLen = int($opt{-length}) || $_LibParam{WordLen};

  my ($SearchPos, @Result, $SeqArevcompl, $StretchA, $StretchB);
  my ($ScanDirectA, $ScanPosA, $ScanDirectB);
  my $i;

  # ensure SeqA < SeqB
  my $SeqALen = length $$pSeqA;
  my $SeqBLen = length $$pSeqB;
  my $bExchange;
  if ($SeqBLen < $SeqALen) {
    ($pSeqA, $pSeqB)   = ($pSeqB, $pSeqA);
    ($SeqALen, $SeqBLen) = ($SeqBLen, $SeqALen);
    $bExchange = 1;
  }
  if ($SeqALen < $WordLen) { return [()]; }

  # make search strings from ends of SeqA (shorter of both sequences)
  my @sSearch =
    ($StretchA = substr ($$pSeqA, 0, $WordLen)),
    &SeqStrRevcompl ($StretchA),
    ($StretchB = substr ($$pSeqA, -$WordLen, $WordLen)),
    &SeqStrRevcompl ($StretchB);

  # do primer search
  if ($debug) {
    printf STDERR "%s. Looking for overlaps with primer length %d\n", &MySub,
      $WordLen;
    if ($bExchange) {
      printf STDERR "%s. Sequences exchanged\n", &MySub;
    }
    printf STDERR "%s. Sequence lengths: SeqA %d, SeqB %d\n", &MySub,
      $SeqALen, $SeqBLen;
  }
  for ($i=0; $i<@sSearch; $i++) {
    $SearchPos = 0;
    while ($SearchPos = index ($$pSeqB, $sSearch[$i], $SearchPos) + 1) {
      $debug and printf STDERR "%s. primary hit case %d, SeqB position %d\n", &MySub,
        $i, $SearchPos;
      push @Result, {
        case   => $i,
        offa   => ($i<2)? 1:0,
        offb   => $SearchPos,
        strand => ($i==0 or $i==2)? 1:-1,
        };
    }
  }

  # debug protocol for primary results
  if ($debug) {
    printf STDERR "%s. Primary results\n", &MySub;
    for ($i=0; $i<@Result; $i++) {
      printf STDERR "  overlap %d: hit case %d, rel. orientation %s, offsets A=%d B=%d\n",
        $i+1, $Result[$i]{case}, $Result[$i]{strand},
        $Result[$i]{offa}, $Result[$i]{offb};
    }
  }

  # no primary hits? => exit
  if (! @Result) { return [()] }
  $SeqArevcompl = &SeqStrRevcompl ($$pSeqA);

  # collect true overlaps
  if ($opt{-overlap}) {
    $debug and printf STDERR "%s. Secondary search\n", &MySub;
    for ($i=0; $i<@Result; $i++) {
      $Result[$i]{length} =
        ($Result[$i]{case} == 0 or $Result[$i]{case} == 3) ?
        $SeqBLen - $Result[$i]{offb} + 1 :
        $Result[$i]{offb} + $WordLen - 1;
      $Result[$i]{length} = &Min ($Result[$i]{length}, $SeqALen);
      if (! $Result[$i]{offa}) {
        $Result[$i]{offa} = $SeqALen - $Result[$i]{length} + 1;
      }
      if ($Result[$i]{case} == 1 or $Result[$i]{case} == 2) {
        $Result[$i]{offb} -= $Result[$i]{length} - $WordLen;
      }
      $debug and printf STDERR "overlap %d: hit case %d, length %d, rel. orientation %s, offsets A=%d B=%d\n",
        $i+1, $Result[$i]{case}, $Result[$i]{length}, $Result[$i]{strand},
        $Result[$i]{offa}, $Result[$i]{offb};
      $StretchA = substr (
        ($Result[$i]{strand} > 0) ? $$pSeqA : $SeqArevcompl,
        ($Result[$i]{strand} > 0) ? $Result[$i]{offa} - 1 :
          -$Result[$i]{offa} - $Result[$i]{length} + 1,
        $Result[$i]{length});
      $StretchB = substr ($$pSeqB,
        $Result[$i]{offb} - 1,
        $Result[$i]{length});
      if ($StretchA ne $StretchB) {
        splice @Result, $i, 1;
        $i --;
      }
    }
  }

  # collect all overlaps
  else {
    for ($i=0; $i<@Result; $i++) {
      $Result[$i]{length} = $WordLen;
      if ($Result[$i]{offa}) {
        $ScanDirectA = 3;
      } else {
        $Result[$i]{offa} = $SeqALen - $WordLen + 1;
        $ScanDirectA = 5;
      }
      $ScanDirectB =
        ($Result[$i]{case}==0 or $Result[$i]{case}==3) ? 3 : 5;
      while (1) {

        # prepare next base positions to compare
        $ScanPosA = ($ScanDirectA == 5) ?
          $Result[$i]{offa} - 2 :
          $Result[$i]{offa} + $Result[$i]{length} - 1;
        if ($Result[$i]{strand} < 0) {
          $ScanPosA = $SeqALen - $ScanPosA - 1;
        }
        $StretchA = substr (
          ($Result[$i]{strand} > 0) ? $$pSeqA : $SeqArevcompl,
          $ScanPosA, 1);
        $StretchB = substr ($$pSeqB,
          ($ScanDirectB == 5) ?
            $Result[$i]{offb} - 2 :
            $Result[$i]{offb} + $Result[$i]{length} - 1,
          1);

        # compare
        if (!$StretchA or !$StretchB or $StretchA ne $StretchB) { last }

        # success => extend identity range
        $Result[$i]{length} ++;
        if ($ScanDirectA==5) { --$Result[$i]{offa} }
        if ($ScanDirectB==5) { --$Result[$i]{offb} }
        if (($ScanDirectA==5 and $Result[$i]{offa}==1) or
            ($ScanDirectB==5 and $Result[$i]{offb}==1)
           )
          { last }
      }
    }
  }

  # sort cases of overlaps
  @Result = sort {
    $b->{length} <=> $a->{length} or
    $a->{offa}   <=> $b->{offa} or
    $b->{strand} <=> $a->{strand} or
    $a <=> $b;
    } @Result;

  # fine workout of cases
  for ($i=0; $i<@Result; $i++) {

    # delete following redundant reports
    if ($Result[$i]{length} == $SeqALen) {
      if ($SeqALen == $SeqBLen) {
        $Result[$i]{type} = 'full identity';
      } elsif ($bExchange) {
        $Result[$i]{type} = 'contemplating identity';
      } else {
        $Result[$i]{type} = 'included identity';
      }
    } elsif (($Result[$i]{strand}>0 and $Result[$i]{offa}==1 and $Result[$i]{offb}+$Result[$i]{length}==$SeqBLen+1) or
             ($Result[$i]{strand}>0 and $Result[$i]{offb}==1 and $Result[$i]{offa}+$Result[$i]{length}==$SeqALen+1) or
             ($Result[$i]{strand}<0 and $Result[$i]{offa}==1 and $Result[$i]{offb}==1) or
             ($Result[$i]{strand}<0 and $Result[$i]{offa}+$Result[$i]{length}==$SeqALen+1 and $Result[$i]{offb}+$Result[$i]{length}==$SeqBLen+1)
            ) {
      $Result[$i]{type} = 'overlap';
    } else {
      $Result[$i]{type} = 'divergent overlap';
    }

    # delete following redundant reports
    for (my $j=$i+1; $j<@Result; $j++) {
      if ($Result[$i]{length}==$Result[$j]{length} and
          $Result[$i]{offa}  ==$Result[$j]{offa} and
          $Result[$i]{offb}  ==$Result[$j]{offb}
         ) {
        splice @Result, $j, 1;
        $j --;
      }
    }

    # correct sequence identity in case of exchange
    if ($bExchange) {
      ($Result[$i]{offa}, $Result[$i]{offb}) = ($Result[$i]{offb}, $Result[$i]{offa});
    }

    # delete temporary information
    delete $Result[$i]{case};
    delete $Result[$i]{directa};
    delete $Result[$i]{directb};
  }

  # debug protocol for results
  if ($debug) {
    printf STDERR "%s. Final results\n", &MySub;
    for ($i=0; $i<@Result; $i++) {
      printf STDERR "  overlap %d: length %d, rel. orientation %s, offsets A=%d B=%d, type %s\n",
        $i+1, $Result[$i]{length}, $Result[$i]{strand},
        $Result[$i]{offa}, $Result[$i]{offb},
        $Result[$i]{type};
    }
  }

  # exit SUB
  return \@Result;
}


# correct corrupt sequence stretch
#
# INTERFACE
# - argument 1: original sequence string
# - argument 2: sequence stretch
#
# - options:
#   -debug      [STD]
#
# - return val: - repaired sequence stretch (purified)
#               - undef if repair is impossible
#
# DESCRIPTION
# - the situation is that the sequence stretch (arg2) origins from the
#   original sequence (arg1) as a sequence range. But it's corrupt now
#   and shall be restored according to the original sequence.
# - Both sequence strings will be purified (incl. upper case letters)
#   before any operation.
#
# DEBUG, CHANGES, ADDITIONS
# - have a discussion about counting in computational or biological system.
#
sub SeqStretchRepair {
  my $RepairLenRel = 0.90;
  my $RepairLenAbs = 20;
  my ($pSeqOrig, $pSeqStretch, %opt) = @_;
  my ($debug, $dbg2);
  my ($SeqOrigPure, $SeqOrigLen, $SeqStretchPure, $SeqStretchLen);
  my ($pOlapPrim, %olap, %diff, $SeqRepair);

  # parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : 0;
  $SeqStretchPure = &SeqStrPure ($$pSeqStretch, -upper=>1);
  $SeqStretchLen  = length $SeqStretchPure;
  $SeqOrigPure    = &SeqStrPure ($$pSeqOrig, -upper=>1);
  $SeqOrigLen     = length ($SeqOrigPure);

  # look for end overlaps
  if ($debug) {
    printf STDERR "%s. looking for end overlaps\n", &MySub;
    print  STDERR "  stretch length: $SeqStretchLen\n";
    print  STDERR "  original length: $SeqOrigLen\n";
  }
  $pOlapPrim = &EndOverlap (\$SeqStretchPure, \$SeqOrigPure, -debug=>$dbg2);
  unless (@$pOlapPrim) {
    $debug and printf STDERR "%s. got no end overlap at all\n", &MySub;
    return undef;
  }

  # pick up beginning and end overlap
  foreach (@$pOlapPrim) {
    if (! $olap{strand} or $_->{strand} == $olap{strand}) {
      if (! $olap{beg} and $_->{offa} == 1) {
        $olap{strand} = $olap{strand} || $_->{strand};
        $olap{beg} = $_;
        next;
      }
      if (! $olap{end} and $_->{offa}+$_->{length}-1 == $SeqStretchLen) {
        $olap{strand} = $olap{strand} || $_->{strand};
        $olap{end} = $_;
        next;
      }
    }
  }
  if (! $olap{beg} and $olap{end}) {
    $olap{beg} = {
      strand => $olap{strand},
      offa   => 0,
      offb   => ($olap{strand} eq 'F') ?
                &Max (0, $olap{end}{offb}-$olap{end}{offa}) :
                &Min ($SeqOrigLen+1, $olap{end}{offb}+$olap{end}{length}-1+$olap{end}{offa}),
      length => 1,
      };
  } elsif (! $olap{end} and $olap{beg}) {
    $olap{end} = {
      strand => $olap{strand},
      offa   => $SeqStretchLen+1,
      offb   => ($olap{strand} > 0) ?
                &Min ($SeqOrigLen+1, $olap{beg}{offb}+$olap{beg}{length}) :
                &Max (0, $olap{beg}{offb}-($SeqStretchLen-($olap{beg}{offa}+$olap{beg}{length}-1))-1),
      length => 1,
      };
  } else {
    $debug and printf STDERR "%s. got no overlap starting at any of the stretch ends\n", &MySub;
    return undef;
  }
  if ($debug) {
    printf STDERR "%s. beginning overlap:\n", &MySub;
    printf STDERR "  rel. orientation %s\n", $olap{beg}{strand};
    printf STDERR "  length %d\n", $olap{beg}{length};
    printf STDERR "  stretch offset %d\n", $olap{beg}{offa};
    printf STDERR "  original offset %d %s\n", $olap{beg}{offb}, ($olap{strand} > 0) ? '' : "(in +1 sense)";
    printf STDERR "%s. end overlap:\n", &MySub;
    printf STDERR "  rel. orientation %s\n", $olap{end}{strand};
    printf STDERR "  length %d\n", $olap{end}{length};
    printf STDERR "  stretch offset %d\n", $olap{end}{offa};
    printf STDERR "  original offset %d %s\n", $olap{end}{offb}, ($olap{strand} > 0) ? '' : "(in +1 sense)";
  }

  # forcing strandedness to 'F'
  if ($olap{strand} < 0) {
    $debug and printf STDERR "%s. Forcing relative strandedness to '+1'\n", &MySub;
    $SeqOrigPure = &SeqStrRevcompl ($SeqOrigPure);
    foreach ($olap{beg}, $olap{end}) {
      $_->{offb}   = $SeqOrigLen - ($_->{offb}+$_->{length}-1) + 1;
      $_->{strand} = 1;
    }
  }

  # get difference interval
  # position values of $diff{'Pos...'} follow the informatical counting philosophy
  $diff{PosPureStretchAnte} = &Min ($olap{beg}{offa}+$olap{beg}{length}-1, $olap{end}{offa});
#  $diff{PosPureStretchAnte} = &Max ($diff{PosPureStretchAnte}, 0);
  $diff{PosPureStretchPost} = &Max ($olap{beg}{offa}+$olap{beg}{length}-1, $olap{end}{offa});
  $diff{PosPureStretchLen}  = $diff{PosPureStretchPost} - $diff{PosPureStretchAnte} - 1;
  if ($diff{PosPureStretchLen} < 0) {
    $diff{PosPureStretchPost} ++;
    $diff{PosPureStretchLen}  ++;
  }
  if ($olap{MoveEnd} = $diff{PosPureStretchPost} - $olap{end}{offa}) {
    $debug and printf STDERR "%s. moving end overlap by $olap{MoveEnd} bp\n", &MySub;
    $olap{end}{offa}   += $olap{MoveEnd};
    $olap{end}{offb}   += $olap{MoveEnd};
    $olap{end}{length} -= $olap{MoveEnd};
  }
  $diff{PosPureOrigAnte} = &Max ($olap{beg}{offb}-1+$diff{PosPureStretchAnte}, 0);
  $diff{PosPureOrigPost} = $olap{end}{offb};
  $diff{PosPureOrigLen}  = $diff{PosPureOrigPost} - $diff{PosPureOrigAnte} - 1;

  # get difference strings
  if ($$pSeqStretch =~ m/^((-*\w){$diff{PosPureStretchAnte}})((-*\w){$diff{PosPureStretchLen}}-*)/ ) {
    $diff{PosExtStretchAnte} = length $1;
    $diff{StretchDiff} = $3;
    $diff{PosExtStretchLen} = length $3;
    $diff{PosExtStretchFill} = $diff{PosExtStretchLen} - $diff{PosPureOrigLen};
  } else {
    printf STDERR "%s. program ERROR concerning RegExp\n", &MySub;
    if ($debug) {
      printf STDERR "%s. difference border values:\n", &MySub;
      print  STDERR "  stretch pos. left before:      $diff{PosPureStretchAnte}\n";
      print  STDERR "  stretch pos. right following:  $diff{PosPureStretchPost}\n";
      print  STDERR "  length for stretch:            $diff{PosPureStretchLen}\n";
      print  STDERR "  original pos. left before:     $diff{PosPureOrigAnte}\n";
      print  STDERR "  original pos. right following: $diff{PosPureOrigPost}\n";
      print  STDERR "  length for original:           $diff{PosPureOrigLen}\n";
    }
    return undef;
  }
  $diff{OrigDiff} = substr ($SeqOrigPure,
    $diff{PosPureOrigAnte}, $diff{PosPureOrigLen});

  # report differences
  if ($debug) {
    printf STDERR "%s. difference located between strand position %d and %d\n", &MySub,
      $diff{PosPureStretchAnte}, $diff{PosPureStretchPost};
    printf STDERR "  stretch  has: %s\n", $diff{StretchDiff}||"''";
    printf STDERR "  original has: %s\n", $diff{OrigDiff}||"''";
  }

  # critical point in changing
  if ($diff{PosExtStretchFill} < 0) {
    $debug and printf STDERR "%s. length of sequence stretch has changed\n", &MySub;
    $diff{PosExtStretchFill} = 0;
  }

  # repair stretch
  $SeqRepair = $$pSeqStretch;
  substr ($SeqRepair, $diff{PosExtStretchAnte}, $diff{PosExtStretchLen}) =
    $diff{OrigDiff} . ('-' x $diff{PosExtStretchFill});

  # exit SUB
  return $SeqRepair;
}


1;
# $Id: SeqCompOld.pm,v 1.6 2018/06/05 18:02:56 szafrans Exp $
