################################################################################
#
#  kPerl Sequence Laboratory
#  Library for RegExp-Based Sequence Motif Search
#    founded formerly in the vitamin D response element project
#    this code gets replaced by MotifIUPAC.pm
#  * obsolete *
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2002
#    Karol Szafranski and Inst. Physiol. Chem., Univ. Dsseldorf, 1997-1998
#    Tobias Schaefer and Inst. Physiol. Chem., Univ. Dsseldorf, 1998
#      (function &MtfProbab)
#  author
#    Tobias Schaefer
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
# DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
# - dependencies:
#   - environment variable 'MOTIFPATH' contains default search paths
#     for motif library files. see MainLib::Path.pm for details.
#   - a standard restriction motif library is expected to reside as
#     $_LibParam{FileRestric} = $CorePath{motif}{restric}
#
################################################################################
#
# FUNCTION OVERVIEW
#
#   @EXPORT
#   %_LibParam (not exported)
#
# - motif library - I/O and manipulation
#   %_LibParam - files and paths
#   &LibMtfRead  old library interface, still used in SeqMotif.pl and SeqHandle.pl
#
# - motif library - restriction data specials
#   &MtfRestricCutstr
#   &LibRestricCheck
#   &LibRestricUpdate
#   &LibRestricScoreTuple
#
#
#  STD OPTIONS
#
#  -debug      print debug protocol to STDERR
#
################################################################################

package SeqLab::MotifRE;

# includes
use strict; #use warnings;  # OK 20061231
use MainLib::StrRegexp qw(&MatchFirstPos);
use MainLib::Data;
use MainLib::Path qw (%CorePath &PathExpand);
use MainLib::File qw(&ReadFile);
use MainLib::Misc qw(&MySub);
use Math::kCalc;
use database::DbPlain qw(&PlainToTable);
use SeqLab::SeqBench;
use SeqLab::MotifLib qw(&_LibMtfFLocate);
use SeqLab::MotifIUPAC qw(&MtfToSeqarr &MtfProbab);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &LibMtfRead
  &MtfRestricCutstr &LibRestricCheck &LibRestricUpdate
    &LibRestricScoreTuple
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# motif library - I/O and manipulation
################################################################################


$_LibParam{FileRestric} = $CorePath{motif}{restric};


# read motif definition file
# *** old library interface, still used in SeqMotif.pl and SeqHandle.pl
#
# INTERFACE
# - argument 1: path of file
#               - &_LibMtfFLocate localises a file in standard search paths
#               - if not given (undefined value!), it's set to the standard
#                 restriction enzyme file.
#
# - options:
#   -debug      [STD]
#
# - return val: - on success:
#                 - reference to array of motif data structures
#                   (= motif library data structure)
#                 - if wantarray:
#                   - reference to array of motif data structures
#                   - reference to motif ID index (hash)
#               - undef if an error occurs
#
# DESCRIPTION
# - The definition file data structure conforms the INI data syntax, cf.
#   &MainLib::Misc::DataRead.
#
sub LibMtfRead {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  $PathIn ||= $_LibParam{FileRestric};

  # read input file, parse data
  # no absolute path specification? => iterate through search paths
  my $FileMtf = &_LibMtfFLocate ($PathIn);
  unless ($FileMtf) {
    if ($debug) {
      printf STDERR "%s. no file matching to library argument %s found\n", &MySub,
        $PathIn||"''";
      &_LibMtfFLocate ($PathIn, -debug=>1);
    }
    return undef;
  }
  my $pMtf = &DataRead ($FileMtf, -debug=>$dbg2);
  unless ($pMtf and @$pMtf) {
    $debug and printf STDERR "%s. no data parsed from file %s\n", &MySub, $FileMtf||"''";
    return undef;
  } else {
    $debug and printf STDERR "%s. %d data entries parsed from file %s\n", &MySub, int @$pMtf, $FileMtf||"''";
  }

  # data check
  for (my $i=0; $i<@$pMtf; $i++) {
    $$pMtf[$i]{DefType} ||= 'RegExp';
    if ($$pMtf[$i]{DefType} !~ m/^(PWM|RegExp)$/ or (! $$pMtf[$i]{def})) {
      $debug and printf STDERR "%s. removing motif entry $i\n", &MySub;
      splice @$pMtf, $i, 1;
      $i --;
    }
  }
  $debug and printf STDERR "%s. %d motif%s found in library file %s\n", &MySub,
    int(@$pMtf), (@$pMtf==1) ? '':'s', $FileMtf||"''";

  # add ID index, return data
  if (wantarray) {
    my %MtfIndex = map { ($_->{id}, $_) } @$pMtf;
    return ($pMtf, \%MtfIndex);
  }

  # return data
  else { return $pMtf }
}


################################################################################
# motif library - restriction data specials
################################################################################


# enter cut string field into motif entry of a restriction motif library
#
# INTERFACE
# - argument 1: reference to motif library data structure
#
# - options:
#   -debug      [STD]
#
sub MtfRestricCutstr {
  my ($pMtflib,%opt) = @_;
  my $debug = $opt{-debug};

  # non-unique entries (purified motifs)?
  my %MtfRedund;
  foreach my $pMtf (@$pMtflib) {
    my $MtfDefPure = &SeqStrPure ($_->{def}, -upper=>1);
    $MtfDefPure =~ s/^N+//;
    $MtfDefPure =~ s/N+$//;
    push @{$MtfRedund{$MtfDefPure}}, $_;

    # derive cut motif
    my %MatchPos = (
      "'" => &MatchFirstPos ($$pMtf{def}, "'"),
      "," => &MatchFirstPos ($$pMtf{def}, ","),
      );
    $$pMtf{CutMotif}  = '-' x ($MatchPos{"'"} <= $MatchPos{","});
    $$pMtf{CutMotif} .= substr ($$pMtf{def}, 
      &Min ($MatchPos{","}, $MatchPos{"'"}) + 1,
      abs ($MatchPos{","} - $MatchPos{"'"}) - 1
      );
    $debug and printf STDERR "%s. entry %s, cut motif: %s => %s\n", &MySub,
      $$pMtf{id}, $$pMtf{def}, $$pMtf{CutMotif};
  }
}


# check restriction enzyme motif library
#
# INTERFACE
# - argument 1: path of old library file
#
# - options:
#   -debug      [STD]
#
sub LibRestricCheck {
  my ($PathLib,%opt) = @_;
  my $debug = $opt{-debug} || 1;
  my $dbg2  = $debug ? $debug-1 : undef;

  # load motif library
  my $pMtflibOld = &LibMtfRead ($PathLib, -debug=>$dbg2);
  unless ($pMtflibOld) {
    $debug and printf STDERR "%s. ERROR: no data parsed from motif library %s\n", &MySub, $PathLib||"''";
    return;
  }
  my @MtflibNonred = @$pMtflibOld;

  # non-unique entries (purified motifs)?
  my %MtfRedund;
  foreach (@MtflibNonred) {
    my $MtfDefPure = &SeqStrPure ($_->{def}, -upper=>1);
    $MtfDefPure =~ s/^N+//;
    $MtfDefPure =~ s/N+$//;
    push @{$MtfRedund{$MtfDefPure}}, $_;
  }
  %MtfRedund = map { ($_,$MtfRedund{$_}) }
    grep { int(@{$MtfRedund{$_}})>1 } keys %MtfRedund;

  # summary report
  if ($debug) {
    printf STDERR "%s. summary\n", &MySub;
    printf STDERR "  total entries: %d\n", int @MtflibNonred;
    printf STDERR "  non-unique entries: %d\n", int keys %MtfRedund;
    foreach my $MtfDefPure (keys %MtfRedund) {
      printf STDERR "    %s: %s\n", $MtfDefPure,
        join (', ', map { $_->{id} } @{$MtfRedund{$MtfDefPure}});
    }
  }

  # tabular listing
  foreach (@MtflibNonred) {
    printf "%s\t%s\n", $_->{id}, $_->{def};
  }
}


# compile restriction enyzme motif library from REBASE source
#
# INTERFACE
# - argument 1: path of old library file
# - argument 2: path of REBASE table file ('Rebase Files', format #32)
#
# - options:
#   -debug      [STD]
#
# - return val: success status (boolean)
#
# DESCRIPTION
# - updated motif data is printed to STDOUT
#
sub LibRestricUpdate {
  my ($PathLib,$PathRebase,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # load old library
  my ($pMtflibOld,$pMtflibOldIdx) = &LibMtfRead($PathLib,-debug=>$dbg2);
  unless ($pMtflibOld) {
    $debug and printf STDERR "%s. ERROR: no data parsed from motif library %s\n", &MySub, $PathLib||"''";
    return 0;
  }

  # open REBASE file
  my $RebasePlain = &ReadFile($PathRebase);
  unless ($RebasePlain) {
    $debug and printf STDERR "%s. ERROR: unable to open REBASE file %s\n", &MySub, $PathRebase||"''";
    return 0;
  }

  # parse REBASE file
  my (@MtflibNonred,$CtNew);
  while ($RebasePlain =~ m/<1>(.*?)\n<2>(.*?)\n<3>(.*?)\n<4>(.*?)\n<5>(.*?)\n/g) {
    my %RebaseEntry = (
      id       => $1,
      DefType  => 'RegExp',
      def      => $3,
      isoschiz => $2 || '-',
      methyl   => $4 ? '+':'-',
      score    => ${ $$pMtflibOldIdx{$1}||{} }{score} || (1 / &MtfProbab($3)),
      source   => $5,
      );
    $RebaseEntry{ScoreCalc} = ${ $$pMtflibOldIdx{$1}||{} }{ScoreCalc} || $RebaseEntry{score};

    # there should be a source
    unless ($RebaseEntry{source}) {
      if ($debug) {
        printf STDERR "%s. no source for enzyme %s\n", &MySub, $RebaseEntry{id}||"''";
        if ($$pMtflibOldIdx{$RebaseEntry{id}}) {
          printf STDERR "  removing entry from motif library\n", $RebaseEntry{id}||"''";
        }
      }
      next;
    }
    delete $RebaseEntry{source};

    # remove syntax description entry
    if ($RebaseEntry{id}  =~ m/^<name>$/) { next }
    # skip methyl-selective entries
    if ($RebaseEntry{id}  =~ m/^(Acc65I|DpnI)$/) { next }
    # skip neoschizomer entries
    if ($RebaseEntry{id}  =~ m/^(BbeI|BssKI|BstF5I|Csp6I|Ecl136II|HinP1I|KasI|MlyI|NgoMIV|Ppu10I|PspGI|PspOMI|SfoI|TaiI|XmaI)$/) { next }
    # skip double-cutters
    if ($RebaseEntry{def} =~ m/^\(/) {
      $debug and printf STDERR "%s. skipping double-cutter enzyme %s\n", &MySub,
        $RebaseEntry{id};
      next; 
    }

    # reformat motif definition
    $RebaseEntry{def} =~ s/\^/'/;

    # expand asymmetric motif
    if ($RebaseEntry{def} =~ m|\((-?\d+)/(-?\d+)\)$| ) {
      if ($1 > 0 or $2 > 0) {
        $RebaseEntry{def} = $` . ('N' x &Max($1,$2));
      } else {
        $RebaseEntry{def} = $`;
      }
      my @site = ( {pos=>$1+length($`), val=>"'"}, {pos=>$2+length($`), val=>','} );
      # fill in cutting symbols from back to front
      foreach (sort { $b->{pos}<=>$a->{pos} or $b->{val} eq ',' } @site) {
        substr ($RebaseEntry{def}, $_->{pos}, 0) = $_->{val};
      }
    }

    # introduce reverse cut site
    unless ($RebaseEntry{def}=~m/,/) {
      if ($RebaseEntry{def}=~m/'/) {
        if (length($`) <= length($')) {
          substr ($RebaseEntry{def},
                  (-length($`)) || length($RebaseEntry{def}), 0) = ',';
        } else {
          substr ($RebaseEntry{def}, length($'), 0) = ',';
        }
      } else {
        printf STDERR "%s. WARNING: enzyme %s without cut sites\n", &MySub,
          $RebaseEntry{id};
        next; 
      }
    }

    # combine library entry and REBASE entry
    if ($$pMtflibOldIdx{$RebaseEntry{id}}) {
      foreach (grep { ! exists $RebaseEntry{$_} }
               keys %{$$pMtflibOldIdx{$RebaseEntry{id}}}) {  # 'comment', 'keyword', 'neoschiz') {
        $RebaseEntry{$_} = $$pMtflibOldIdx{$RebaseEntry{id}}{$_};
      }
      if ($debug) {
        printf STDERR "%s. updating existing entry %s\n", &MySub, $RebaseEntry{id};
        foreach ('def', 'isoschiz', 'score') {
          if ($$pMtflibOldIdx{$RebaseEntry{id}}{$_} ne $RebaseEntry{$_}) {
            printf STDERR "  field %s changed: %s => %s\n",
              $_, $$pMtflibOldIdx{$RebaseEntry{id}}{$_}||"''", $RebaseEntry{$_}||"''";
          }
        }
      }
      delete $$pMtflibOldIdx{$RebaseEntry{id}};
    } elsif ($debug) {
      printf STDERR "%s. adding entry %s\n", &MySub, $RebaseEntry{id}||"''";
      ++ $CtNew;
    }

    # enter motif
    push @MtflibNonred, { %RebaseEntry };
  }

  # derive cut shapes
  &MtfRestricCutstr (\@MtflibNonred, -debug=>$debug);

  # missing entries changed to isoschizomers?
  foreach my $pMtfMiss (values %$pMtflibOldIdx) {
    unless ($$pMtfMiss{id}) {
      printf STDERR "%s. CODE ERROR: undefined entry in motif index\n", &MySub;
      next;
    }
    $debug and printf STDERR "%s. checking entry %s for isoschizomer change\n", &MySub, $$pMtfMiss{id};
    my ($pMtfTgt) = grep { $_->{isoschiz}=~m/\b($$pMtfMiss{id})\b/ } @MtflibNonred;
    if ($pMtfTgt) {
      $debug and printf STDERR "%s. entry %s became an isoschizomer of %s\n", &MySub,
        $$pMtfMiss{id}, $$pMtfTgt{id};
      foreach (grep { ! exists $$pMtfTgt{$_} } keys %$pMtfMiss) {
        $$pMtfTgt{$_} = $$pMtfMiss{$_};
      }
      delete $$pMtflibOldIdx{$$pMtfMiss{id}};
      $CtNew --;
    }
  }

  # non-unique entries (purified motifs)?
  # NOTE: entries may be neoschizomers (differing in cut product rather than recognition motif)
  my %MtfRedund;
  foreach (@MtflibNonred) {
    my $MtfDefPure = &SeqStrPure ($_->{def}, -upper=>1);
    $MtfDefPure =~ s/^N+//;
    $MtfDefPure =~ s/N+$//;
    push @{$MtfRedund{$MtfDefPure}}, $_;
  }
  %MtfRedund = map { ($_,$MtfRedund{$_}) }
    grep { int(@{$MtfRedund{$_}})>1 } keys %MtfRedund;

  # summary report
  if ($debug) {
    printf STDERR "%s. update summary\n", &MySub;
    printf STDERR "  entries deleted: %d%s%s\n",
      int keys %$pMtflibOldIdx,
      (keys %$pMtflibOldIdx) ? ' = ':'', join (', ', keys %$pMtflibOldIdx);
    printf STDERR "  new entries: %d\n", $CtNew;
    printf STDERR "  total entries: %d\n", int @MtflibNonred;
    printf STDERR "  non-unique entries: %d\n", int keys %MtfRedund;
    foreach my $MtfDefPure (keys %MtfRedund) {
      printf STDERR "    %s: %s\n", $MtfDefPure,
        join (', ', map { "$_->{id} (cut $_->{CutMotif})" } @{$MtfRedund{$MtfDefPure}});
    }
  }

  # output data, return success status
  &DataPrint (\@MtflibNonred, -NoAddr=>1);
  return 1;
}


# update/enter scores in motif library according to tuple library
#
# INTERFACE
# - argument 1: path of motif library file
# - argument 2: tuple table filename stem for update
# - argument 3: field label for new score entry in the motif library
#
# - options:
#   -debug      [STD]
#
# DESCRIPTION
# - updated motif data is printed to STDOUT
#
sub LibRestricScoreTuple {
  my $TupleSizeMax = 8;
  my ($PathMotif,$StampTuple,$ScoreLabel,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # load motif library
  my $pMtfLib;
  unless ($pMtfLib = &LibMtfRead ($PathMotif, -debug=>$dbg2)) {
    $debug and printf STDERR "%s. ERROR: no data parsed from motif library file %s\n", &MySub,
      $PathMotif||"''";
    return undef;
  }

  # loop over tuple size
  foreach my $TupleSize (4..8) {

    # load tuple table
    my $PathTable = $StampTuple . ".tab${TupleSize}";
    my $pTupleTable = &PlainToTable ($PathTable, -TabType=>'HIA', -debug=>$dbg2);
    unless ($pTupleTable and %$pTupleTable) {
      $debug and printf STDERR "%s. ERROR: got no tuple table data from file %s\n", &MySub,
        $PathTable||"''";
      return undef;
    }

    # loop over purified motif definitions
    foreach my $pMtf (@$pMtfLib) {
      my $MtfDefPure = &SeqStrPure ($$pMtf{def}, -upper=>1);
      $MtfDefPure =~ s/^N+//;
      $MtfDefPure =~ s/N+$//;

      # are tuples of this size available in the tuple library
      if ($TupleSize == $TupleSizeMax and length($MtfDefPure) > $TupleSize) {
        my $TupleSizeDiff = length($MtfDefPure) - $TupleSize;
        my $i = ($MtfDefPure =~ s/N{$TupleSizeDiff}$//);
        printf STDERR "%s. shrinking motif %s/%s, success: %d\n", &MySub,
          $$pMtf{id}, $$pMtf{def}, $i;
      }
      unless (length $MtfDefPure == $TupleSize) { next }
      $debug and printf STDERR "%s. working on motif %s, %d combinations\n", &MySub,
        $MtfDefPure||"''", int (&MtfToSeqarr ($MtfDefPure, -strands=>1));

      # sample tuple scores for motif instances
      # - tuple library is expected to yield the needed information for either
      #   single-stranded or double-stranded sequences. This way, &MtfToSeqarr
      #   needs to calculate only forward-oriented instances of the motif (switch 
      #   -strands => 1).
      my $MtfScore;
      foreach (&MtfToSeqarr ($MtfDefPure, -strands=>1)) {
        my $TupleScore = $$pTupleTable{$_}[1];
        unless ($TupleScore) {
          $debug and printf STDERR "%s. no score found for tuple %s\n", &MySub, $_||"''";
          next;
        }
        $MtfScore += $TupleScore;
      }

      # enter total tuple score for motif
      # recalculate:
      # - motif score is the mean distance of motif instances on
      #   double-stranded DNA (= reverse of expected rel. frequency).
      if ($MtfScore) {
        $$pMtf{$ScoreLabel} = int (1 / $MtfScore);
      } else {
        $$pMtf{$ScoreLabel} = 0;
      }
      $debug and printf STDERR "  score ($ScoreLabel): %d\n", $$pMtf{$ScoreLabel};

      # calculated score in case of normal distribution, just for comparison
      $$pMtf{ScoreCalc} = 1 / &MtfProbab ($MtfDefPure, -strands=>0);
    }
  }

  # output motif library
  &DataPrint ($pMtfLib);
}


1;
# $Id: MotifRE.pm,v 1.16 2018/06/05 18:02:56 szafrans Exp $
