################################################################################
#
#  kPerl Sequence Laboratory
#  Object Library for Sequence Motif Library
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 2002,2004,
#    szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - purpose: handle bunch of motif definitions and allow batch motif search
#   independent of motif format types
#
# - environment variable 'MOTIFPATH' contains default search paths
#   for motif library files. see MainLib::Path.pm for details.
#
# - individual description of functions can be found at the beginning of the
#   code blocks
#
################################################################################
#
#  OBJECT OPERATORS
#
#   *** NONE ***
#
#
#  OBJECT METHODS  for external access
#
# - housekeeping and file I/O
#   new           create object, see MainLib::DefaultObjHash.pm
#                 internally calls ini()
#   ini           initialise object with motif definitions from files
#                 internally calls Load()
#   AddSwitch     modify object options
#   Push          add motif object
#   Load          load (add) motif definitions from file(s)
#   Paths         array of referenced paths
#   IDs           array of motif IDs
#   Size          number of motif objects in library
#
# - functionality
#   Search        return hits for search on sequence using motifs in library
#   Statist       output statistics - some debugging function
#
#
#  OBJECT DATA STRUCTURE
#  (hash)
#
#   motif         reference on array of motif objects
#   path          array of referenced file paths
#   SeqScanned    count of scanned sequences
#   SeqHits       count of hits in scanned sequences
#   switch        hash reference for object switches, cf. method AddSwitch()
#     -debug        print debug protocol to STDERR
#     -SlcID        select for motif ID for all actions
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @ISA
#   %_LibParam
#
# - housekeeping and file I/O
#   &new  see MainLib::DefaultObjHash.pm
#   &ini
#   &AddSwitch
#   &_LocalSwitch  see MainLib::DefaultObjHash.pm
#   $_LibParam{parse}  file parsing manifold
#   &Load
#   &Paths
#   &IDs
#   &Size
#
# - search
#   motif search result data struture
#   &Search
#   &Statist
#
# - non-method elementary functions
#   $_LibParam{PathLib}  motif search paths
#   $_LibParam{FileRestric}  not in use
#   &_LibMtfFLocate
#   &_LibMtfFType
#
#
#  STD OPTIONS
#
#  -debug      print debug protocol to STDERR
#
################################################################################
package SeqLab::MotifLib;

# includes
#use strict; use warnings;  # OK 20040805
use MainLib::Data;
use MainLib::DefaultObjHash;
use MainLib::Path qw (%CorePath &PathExpand);
use MainLib::Misc qw(&MySub);
use SeqLab::SeqBench qw(&SeqStrPure);

# inheritance
our @ISA;
push @ISA, qw(MainLib::DefaultObjHash);

# symbol export
# - only for non-method functions
# - no default export
use Exporter;
push @ISA, qw(Exporter);
our @EXPORT_OK = qw (&_LibMtfFLocate);

# package-wide constants and variables
my %_LibParam;


################################################################################
# housekeeping and file I/O
################################################################################


# parametric initialisation
#
# INTERFACE
# - argument 1*: path(s) of sequence input file(s)
# - options:     aren't accepted here! Use $this->AddSwitch()
# - return val:  object reference
#
# DESCRIPTION
# - the object may be initialised with a NULL list of args.
#   This way it's possible to set object switches (method AddSwitch())
#   prior to load actions.
#
sub ini {
  my ($this, @arg) = @_;

  # initialise object, keep it blessed
  %$this = ();

  # fill object by loading the specified motif definitions
  $this->Load(@arg);

  return $this;
}


# enter object switches
#
# INTERFACE
# - argument 1*: hash of switches
# - return val:  success status (boolean)
#
sub AddSwitch {
  my ($this, %oopt) = @_;
  my ($bErr);

  # loop over switches
  while (my($key,$val) = each(%oopt)) {
    if (0) {}

    # delegate -debug to motif objects
    elsif ($key eq '-debug' and ($val||0)>1) {
      foreach my $pMtf (@{$this->{motif}}) {
        $bErr ||= ! $pMtf->AddSwitch($key=>$val-1,-TmpPreserve=>1);
      }
    }

    if (0) {}

    # enter option -SlcID
    # - selector enbodied by array of IDs is not supported
    elsif ($key eq '-SlcID' and ref($val) eq 'ARRAY') { next }

    #options that we just have to enter
    else {
      if (defined $val) { $this->{switch}{$key} = $val; }
      else       { delete $this->{switch}{$key}; }
    }
  }

  # return success status
  return !$bErr;
}


# add motif object(s)
#
# INTERFACE
# - argument 1*: reference to motif object(s)
# - options:     cannot be set here, use object switches
# - return val:  number of successfully added motif objects
#
sub Push {
  my ($this, @ArgMtf) = @_;
  my $debug = $this->{switch}{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # preparations/modifications according to options of motif library
  if ($dbg2) {
    foreach my $pMtf (@ArgMtf) { $pMtf->AddSwitch(-debug=>$dbg2) }
  }

  # enter source path and motif objects
  push @{$this->{motif}}, grep{$_->Valid()} @ArgMtf;

  # enter source path(s)
  # - currently, the motif objects do not contain information on their
  #   source file
  #push @{$this->{path}}, $SrcPath;

  # return reference on data structure
  return int (grep{$_->Valid()} @ArgMtf);
}


# load motif definitions from file - format-specific
#
# INTERFACE
# - argument 1: path of file
# - options:
#   all possible object switches. However, as this is an abstract interface, it
#   makes sense to use only those object switches that are common to all motif
#   objects.
# - return val: array of references to loaded motif data structures
#
$_LibParam{parse} = {
  HMMer => sub {
      require SeqLab::MotifHMMer;
      return SeqLab::MotifHMMer->new(@_);
    },
  IUPACstruct => sub {
      require SeqLab::MotifIUPAC;
      my (@mtf);
      my $paData = &DataRead ($_[0]);
      foreach my $pMtfData (@$paData) {
        push @mtf, SeqLab::MotifIUPAC->new($pMtfData);
      }
      return @mtf;
    },
  RunPWM => sub {
      require SeqLab::MotifRunPWM;
      return SeqLab::MotifRunPWM->new(@_);
    },
  PWMscan => sub {
      require SeqLab::MotifPWMscan;
      return SeqLab::MotifPWMscan->new(@_);
    },
  };


# load motif definitions from file - format-independent
#
# INTERFACE
# - argument 1*: path(s) of file(s)
# - options:     cannot be set here, use object switches
# - return val:  number of successfully loaded motif data structures
#
# DESCRIPTION
# - Note that the argument convention is different from specific Motif*
#   objects.
#
sub Load {
  my ($this,@ArgSrc) = @_;
  my $debug = $this->{switch}{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # read motifs from files
  my ($MtfNum);
  foreach my $ItSrc (@ArgSrc) {

    # analyse source argument
    my $SrcPath = &_LibMtfFLocate ($ItSrc, -debug=>$dbg2);
    unless (defined $SrcPath) {
      printf STDERR "%s. WARNING: unable to locate file for motif source %s\n", &MySub, $ItSrc||"''";
      next;
    }
    my $SrcType = &_LibMtfFType ($SrcPath, -debug=>$dbg2);
    unless (defined $SrcType) {
      printf STDERR "%s. WARNING: motif file %s has undeterminable format type\n", &MySub, $SrcPath||"''";
      next;
    }
    $debug and printf STDERR "%s. loading motif source %s, type %s\n", &MySub,
      $SrcPath||"''", $SrcType||"''";

    # load motifs
    my @mtf = &{$_LibParam{parse}{$SrcType}} ($SrcPath, -debug=>$dbg2);
    my $MtfAdded = int (@mtf);
    $MtfNum += $MtfAdded;
    $debug and printf STDERR "%s. loaded %d motif%s from file %s\n", &MySub,
      $MtfAdded, ($MtfAdded==1)?'':'s', $SrcPath||"''";

    # on successful load
    if ($MtfAdded) {

      # final preparations/modifications on motif objects
      if ($dbg2) {
        foreach my $pMtf (@mtf) { $pMtf->AddSwitch(-debug=>$dbg2) }
      }

      # enter source path and motif objects
      push @{$this->{motif}}, @mtf;
      push @{$this->{path}}, $SrcPath;
    }
    else {
      printf STDERR "%s. WARNING: no motifs found in file %s\n", &MySub, $SrcPath||"''";
    }
  }

  # return reference on data structure
  return $MtfNum;
}


# return array of referenced paths
#
# INTERFACE
# - return val: array of referenced paths
#
sub Paths {
  my ($this) = @_;
  return @{$this->{path}};
}


# return array of motif IDs
#
# INTERFACE
# - return val: array of referenced paths
#
sub IDs {
  my ($this) = @_;

  my (@ID);
  foreach my $pMtf (@{$this->{motif}}) {
    push @ID, $pMtf->ID() || ();
  }

  return @ID;
}


# return number of motif objects in library
#
# INTERFACE
# - return val: number of motif objects in library
#
sub Size {
  my ($this) = @_;
  return int (@{$this->{motif}});
}


################################################################################
# search
################################################################################


# motif search result data structure
#
# @result        array of %instance hashes
#
# %instance      hash with fields:
#   MotifID      motif identifier
#   orient       match orientation, possible values:
#                -1  inverted
#                 1  direct
#                 0  undefined or symmetric
#   offset       match offset (= sequence position of left border of match,
#                irrespective of match orientation)
#   offdef       match offset for motif definition
#                This may differ from 'offset' if a ...
#   length       instance width (typically motif definition width, but may
#                differ in motif types that're flexible for InDels)
#   score        instance score
#   ScoreExp     instance expectancy, mostly defined as probability of hit per
#                possible site
#   instance     instance (sequence string) of search pattern
#   ante         sequence string preceding hit
#   post         sequence string following hit
#


# find instances of search motifs in given sequence
#
# INTERFACE
# - argument 1: query sequence string
#
# - options:
#   -debug      [STD]
#   -HitSurrd   lengths of stored surrounding sequences, default: 0
#   -SeqID      provide sequence ID, just for convenience
#   -strands    which strand to search
#               0   both strands (default)
#               -1  minus strand only
#               1   plus strand only
#
# - return val: - reference to result data structure
#               - undef if an error occurred
#
# DESCRIPTION
# - the query sequence string will be purified prior to search
#
sub Search {
  my ($this, $sSeqQuery, %opt) = @_;
  my %lopt = $this->_LocalSwitch(%opt);
  my $debug = $lopt{-debug}; delete $lopt{-debug};
  my ($pMtf, $pMtfHits, @HitSum);

  # pre-check
  unless (exists($this->{motif}) and $this->{motif} and @{$this->{motif}}) { return []; }

  # prepare search sequence
  $sSeqQuery = &SeqStrPure ($sSeqQuery, -upper=>1);
  unless (length ($sSeqQuery)) {
    printf STDERR "%s. WARNING: empty sequence string - skipped\n", &MySub;
    return [];
  }

  # perform search
  foreach $pMtf (@{$this->{motif}}) {
    if ($lopt{-SlcID} and $pMtf->ID()!~m/$lopt{-SlcID}/) { next }
    $pMtfHits = $pMtf->Search ($sSeqQuery, %lopt, -isPure=>1);
    if (defined $pMtfHits) {
      push @HitSum, @$pMtfHits;
      $this->{SeqHits} += int @$pMtfHits;
    } else {
      printf STDERR "%s. ERROR: got undef from %s->Search()\n", &MySub, ref($pMtf);
      return undef;
    }
  }
  $this->{SeqScanned} ++;

  # return result
  return \@HitSum;
}


# output statistics
#
# INTERFACE
# - argument 1*: output handle, default STDOUT
# - options:     cannot be set here, use object switches
#
sub Statist {
  my ($this,$hOutTab) = @_;
  if (not defined($hOutTab)) { $hOutTab=\*STDOUT }

  # report
  printf $hOutTab "%s. motif library statistics\n", &MySub;
  printf $hOutTab "  motifs: %d\n", int(@{$this->{motif}||[]});
  my %MtfType = map{($_,1)} map{ $_->DefType() } @{$this->{motif}};
  return;
  printf $hOutTab "  motif DefTypes: %d - %s\n",
    int(keys(%MtfType)), join(' ',keys(%MtfType));
  printf $hOutTab "  sequences scanned: %d\n", $this->{SeqScanned};
  printf $hOutTab "  motif hits: %d\n", $this->{SeqHits};
}


################################################################################
# search
################################################################################


$_LibParam{PathLib} = [ '.', @{$CorePath{motif}{lib}} ];
$_LibParam{FileRestric} = $CorePath{motif}{restric};

# locate motif definition file according to search paths
#
# INTERFACE
# - argument 1: path statement for motif file
#
# - options (only function-local):
#   -debug      [STD]
#
# - return val: - motif file path
#               - undef if an error occurs
#
# DESCRIPTION
# - A motif file without a directory specified is automatically searched
#   in the search paths specified in @{$_LibParam{PathLib}}.
#
sub _LibMtfFLocate {
  my ($MtflibSpec, %opt) = @_;
  my (@PathSrch, $MtfFile);

  # read input file, parse data
  # no absolute path specification? => iterate through search paths
  if ($MtflibSpec =~ m|/|) {
    @PathSrch = ('');
    $MtflibSpec = &PathExpand ($MtflibSpec);
  } else {
    @PathSrch = map { "$_/" } @{$_LibParam{PathLib}};
  }
  $opt{-debug} and printf STDERR "%s. search paths:\n%s", &MySub,
     join('',map{ "  $_\n" }@PathSrch);
  $MtfFile = (
    grep { -f $_ }
    map { ("$_$MtflibSpec", "$_$MtflibSpec.mtf") } @PathSrch
    )[0];
    # we cannot construct the path with linker "/" because the search path
    # may be a null-string
  $opt{-debug} and printf STDERR "%s. located motif file %s\n", &MySub, $MtfFile;

  # return data
  return $MtfFile;
}


# determine format of motif definition file
#
# INTERFACE
# - argument 1: path of file
#
# - options (only function-local):
#   -debug      [STD]
#
# - return val: - source file format
#               - undef if format was not determined or an error occured
#
sub _LibMtfFType {
  my ($MtfFile,%opt) = @_;
  my $debug = $opt{-debug};

  # read 1st non-comment line
  unless (open (INMTF,$MtfFile)) {
    printf STDERR "%s. ERROR: unable to read file %s\n", &MySub, $MtfFile||"''";
    return undef;
  }
  my ($line,$Line1stComm);
  while (<INMTF>) {
    chomp;
    if (m/^#/) {
      $Line1stComm ||= $_;
    } elsif ($_ eq '') {
      next;
    } else {
      $line = $_;
      $debug and printf STDERR "%s. first non-comment line:\n%s\n", &MySub, $line;
      last;
    }
  }
  close (INMTF);
  if (! defined $line) { return undef }

  # determine format
  if ($line =~ m/^\s*(ARRAY)?\{\s*$/) {
    return 'IUPACstruct';
  }
  if ($line =~ m/^HMMER/i) {
    return 'HMMer';
  }
  if ($line =~ m/^PWMsuite/i) {
    return 'PWMscan';
  }
  if ($line =~ m/^\s*\d+\s+\d+\s+[\d.-]+/) {
    return 'RunPWM';
  }

  # format undetermined
  return undef;
}


1;
# $Id: MotifLib.pm,v 1.13 2006/08/21 21:05:14 sza Exp $
