################################################################################
#
#  kPerl Sequence Laboratory
#  Object Library for Sequential Sequence File Input
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1999-2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - purpose:
#   this class is a sequential sequence input interface for sequence file
#   formats that're available via functions referenced in
#   $SeqLab::SeqFormat::SeqFFmtGet
#   Additionally, it provides access to contig consensus sequences of a GAP4
#   project. The according Tcl script to export consensus sequences should
#   be callable via $CorePath{call}{GapExportCons}.
#
# - individual description of functions can be found at the beginning of the
#   code blocks
#
################################################################################
#
#  OBJECT OPERATORS
#
#   *** NONE ***
#
#
#  OBJECT METHODS  for external access
#
# - housekeeping
#   new           create object, initialize via ini(@args)
#                 @args is a variable-length list of input files or input pipes.
#                 Argument "-" works as an alias to STDIN. Input handles are not
#                 allowed.
#   ini           (re-)initialize file queue with given list of files. This is
#                 called by new() in order to initialize the object but it may
#                 be used from outside also. However, as it has the same effect
#                 like creating a new object it will be more transparent to
#                 explicitly call new() instead of ini().
#   AddSwitch     modify object options
#   Clear         clear file queue
#                 Object options are preserved.
#   Push          add elements at the end of the file queue
#
# - functionality
#   GetNext       get sequence entry from queue
#
#
#  OBJECT DATA STRUCTURE
#  (hash)
#
#    handle       current input file handle, created internally, is a
#                 FileHandle::Unget.
#    NmbGetNext   number of calls to method GetNext
#    NmbRead      number of returned sequences
#    PathSrc      reference on array of input paths
#    PathTmp      path of temporary file
#    SeqBufferRaw
#                 reference to array of buffered sequences, raw from input
#    SeqBuffer    reference to array of buffered sequences, selected and
#                 processed
#    SrcCurr      hash giving information about the current source file
#                   path      file path (may be temporary file)
#                   opath     original file path (or handle)
#                   SrcMulti  file already turned out to be multiple-sequence
#                             containing
#                   type      sequence file format type
#    switch       hash reference for object switches, cf. method AddSwitch()
#     -ClipQuality  return only quality range of sequence string (for
#                   Experiment file format). This switch doesn't take effect
#                   if switch -fast is set.
#     -ClipUnk      clip unknown symbols at the beginning/end of the sequence
#                   string
#     -debug        print debug protocol to STDERR
#     -fast         read sequence input quick and dirty. This may
#                   take effect file format: Experiment, GenBank, [...?].
#     -FilterDescr  filter by sequence description
#     -FilterID     apply given Regexp to sequence identifier to filter off
#                   sequence entries
#     -lower        force sequence string to lower case letters
#     -MatchID      select by matching to sequence identifiers. Selector may be:
#                     ARRAY ref list of sequence identifiers (not regexps!).
#                               Internally converted to hash, aiming at faster
#                               performance.
#                     HASH ref  keyed list of sequence identifiers (not
#                               regexps!)
#     -pure         purify sequence strings for sequence-coding letters
#                   You may specify a sequence type. Then, fuzzy letters are
#                   converted to official 'unknowns'.
#                   Positional references are adjusted if needed.
#                   Upper/lower-case appearance of the sequence string remains
#                   unchanged.
#     -SlcDescr     select by sequence description
#     -SlcEnds      return only the sequence ends having the specified length
#                   of N letters. If the sequence is shorter than two times
#                   this value the complete sequence will be returned.
#                   The returned sequence has a purified sequence string and
#                   additionally, the purified sequence string will be stored
#                   in a field labelled 'SeqPure'.
#     -SlcID        select by regexp apllied to sequence identifier
#     -SlcLen       select sequences by means of their length, reference to
#                   range object.
#                   - The purified sequence string will be stored in a field
#                     labelled 'SeqPure'.
#                   - Selection will be done after application of -ClipQual
#                     and/or -ClipUnk.
#     -SlcType      select by sequence type
#     -TmpPreserve  do not unlink temporary files, propagate option -preserve
#                   to global manager of temporary files
#     -upper        force sequence string to upper case letters
#
#
#  FUNCTIONS, DATA
#
#   @ISA
#   %_LibParam
#
# - basics
#   $_LibParam{TmpManag}
#   &new  see MainLib::DefaultObjHash.pm
#   &ini
#   &AddSwitch
#   &_LocalSwitch  see MainLib::DefaultObjHash.pm
#   &Clear
#   &Push
#   &_RmTmp
#   &DESTROY
#
# - functionality
#   &GetNext
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - using -SlcEnds you'll loose any annotation information
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::SeqStreamIn;

# includes
use strict; #use warnings;  # OK 20040818
use FileHandle::Unget;
use MainLib::DefaultObjHash;
use MainLib::Data qw(&DataClone &DataPrint);
use MainLib::Path;
use MainLib::FileTmp;
use MainLib::Misc qw(&MySub);
use Math::kCalc qw(&Sum);
use SeqLab::SeqBench;
use SeqLab::SeqFormat qw(%SeqFFmtGet &SeqFileFormat);

# inheritance
our @ISA;
push @ISA, qw(MainLib::DefaultObjHash);

# package-wide constants and variables
my %_LibParam;


################################################################################
# housekeeping
################################################################################


# handle temporary files globally
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $_LibParam{TmpManag} = MainLib::FileTmp->new(); };
&$pcFT;


# parametric initialization
#
# INTERFACE
# - argument 1*: path(s) of sequence input file(s)
# - options:     aren't accepted here! Use $this->AddSwitch()
# - return val:  object reference
#
sub ini {
  my ($this, @arg) = @_;

  # initialize object, keep it blessed
  %$this = ();

  # initialize file queue and fill it with arguments
  $this->Push(@arg);

  # return
  return $this;
}


# enter object switches
#
# INTERFACE
# - argument 1*: hash of switches
# - return val:  success status (boolean)
#
sub AddSwitch {
  my ($this,%oopt) = @_;
  my ($bErr);

  # function parameters
  my $debug = $this->{switch}{-debug} || $oopt{-debug};

  # loop over switches
  while (my($key,$val) = each(%oopt)) {
    if (0) {}

    # delegate -debug to "MainLib::FileTmp"
    elsif ($key eq '-debug' and ($val||0)>1) {
      $bErr ||= ! $_LibParam{TmpManag}->AddSwitch($key=>$val-1,-preserve=>1);
    }
    # delegate -TmpPreserve to "MainLib::FileTmp"
    elsif ($key eq '-TmpPreserve') {
      $bErr ||= ! $_LibParam{TmpManag}->AddSwitch(-preserve=>$val);
    }

   # from here on, each case block exclusively defines the action of object
   # re-shaping associated with that switch
    if (0) {}

    # enter option -MatchID
    elsif ($key eq '-MatchID' and ref($val) eq 'ARRAY') {
      $debug and printf STDERR "%s. option -MatchID with ARRAY of ID selectors (%d entr%s)\n", &MySub,
        int(@$val), (int(@$val)==1)?'y':'ies';
      $this->{switch}{-MatchID} = { map{($_=>1)} @$val };
    }

    # enter option -SlcLen
    elsif ($key eq '-SlcLen' and $val) {

      # range object
      if (ref($val) eq 'Math::Range') {
        $this->{switch}{-SlcLen} = $val;
      }

      # string syntax
      elsif (! ref($val)) {
        $val =~ s/\s+//g;

        # old syntax - this may be obsolete soon
        $val =~ s/^>=//g;
        if ($val =~ m/^([<>](=)?)(\d+)$/) {
          if (substr ($1, 0, 1) eq '<') {
            $val = '1..' . ($2 ? $3 : $3-1);
          } else {  # have '>\d+'
            $val = $3 + 1;
          }
        }
        if ($val =~ m/^(\d*)-(\d+)$/ or $val =~ m/^(\d+)-(\d*)$/) {
          if ($1 and $2 and $2 < $1) {
            printf STDERR "%s. syntax ERROR in string argument '%s', option -SlcLen\n", &MySub, $val;
            $bErr = 1; next;
          }
          $1 and $val = $1;
          $2 and $val .= "..$2";
        }

        # current syntax
        if ($val =~ m/^\d+(\.\.\d+)?$/) {
          require Math::Range;
          $this->{switch}{debug} and printf STDERR "%s. argument of option -SlcLen\n  %s", &MySub;
          $this->{switch}{debug} and &DataPrint($val,-handle=>\*STDERR);
          $this->{switch}{-SlcLen} = Math::Range->new_parsed($val);
          unless ($this->{switch}{-SlcLen}) {
            printf STDERR "%s. ERROR: construction of range object failed\n  option: -SlcLen\n  string argument: %s\n", &MySub, $val;
            $bErr = 1; next;
          }
        }

        # syntax error
        else {
          printf STDERR "%s. ERROR: strange argument '%s', option -SlcLen\n", &MySub, $val;
          $bErr = 1; next;
        }
      }

      # old syntax - this may be obsolete soon
      else {
        printf STDERR "%s. ERROR: strange argument (ref. %s) for option -SlcLen\n", &MySub, ref($val);
        $bErr = 1; next;
      }
    }

    #options that we just have to enter
    else {
      if (defined $val) { $this->{switch}{$key} = $val; }
      else       { delete $this->{switch}{$key}; }
    }
  }

  # return success status
  return !$bErr;
}


# clear file queue
#
sub Clear {
  my ($this) = @_;

  # close handles, delete temporary files
  delete $this->{handle};
  $this->_RmTmp();

  # initialize anything but object switches
  foreach (grep{ !/^switch$/ } keys %{$this}) {
    delete $this->{$_};
  }
  # leave object switches
}


# unlink temporary file if there is one
#
sub _RmTmp {
  my ($this) = @_;
  my $bGo = (!$this->{switch}{-debug} or !$this->{switch}{-TmpPreserve});
  if (exists($this->{PathTmp}) and $this->{PathTmp} and $bGo) {
    unlink $this->{PathTmp};
    delete $this->{PathTmp};
  }
}


# add files at the end of the queue
#
# INTERFACE
# - argument 1+: path(s) of sequence input file(s)
# - options:     aren't accepted here! Use $this->AddSwitch()
#
sub Push {
  my ($this, @arg) = @_;

  # append paths to array of input paths
  push @{$this->{PathSrc}}, @arg;
}


# object's tidy up
#
sub DESTROY {
  my ($this) = @_;
  # delete temporary files (for previous queue entries)
  $this->_RmTmp();
}


################################################################################
# functionality
################################################################################


# read next sequence entry from input queue
#
# INTERFACE
# - options:    all object switches may be set here to last during
#               the function call. Note that -SlcLen only works if
#               specified as reference to range object.
# - return val: - reference to sequence array data structure
#               - undef if queue is empty or an error occurs
#
sub GetNext {
  my ($this,%opt) = @_;
  my ($pFunc, $call, $ret);
  my ($pSeq, $iSeqLen, $RegexpUnk);

  # function parameters
  my %lopt = $this->_LocalSwitch(%opt);
  my $debug = $lopt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # function entry, early debug
  $this->{NmbGetNext} ++;
  $debug and printf STDERR "%s. call no. %d\n", &MySub, $this->{NmbGetNext};

  # retry block
  # - this is needed for selection features and it's better than
  #   doing recursive calls (which may cause memory overkill)
  # - first: look for sequence entries which may be buffered in 
  #   @{$this->{SeqBuffer}} or @{$this->{SeqBufferRaw}}.
  GetNextRetry: {
    undef $pSeq;
    undef $iSeqLen;

    # any sequence in the buffer (processed entries)? => return it
    if ($pSeq = shift @{$this->{SeqBuffer}}) {
      $debug and printf STDERR "%s. getting processed sequence from queue: %s\n", &MySub, $$pSeq{id}||"''";
      return $pSeq;
    }

    # any sequence in the buffer (raw entries)? => break on to selection/modification
    if ($pSeq = shift @{$this->{SeqBufferRaw}}) {
      $debug and printf STDERR "%s. getting raw sequence from queue: %s\n", &MySub, $$pSeq{id}||"''";
    }
    
  ##############################################################################
  # force $this->{handle} being valid for input of (next) sequence entry
  # get next file path, do pre-work & parsing
    else {

      # ensure $this->{handle} pointing to the next sequence entry
      if (!defined($this->{handle}) or eof($this->{handle})) {
        if (defined $this->{handle}) { undef $this->{handle} }

        # delete temporary files (for previous queue entries)
        $this->_RmTmp();

        # get the next input path
        if (my $PathSrc = shift @{$this->{PathSrc}}) {
          $debug and printf STDERR "%s. turning to next file on queue: %s\n", &MySub, $PathSrc||"''";
          $this->{SrcCurr} = {};

          # source-specific input source handling
          # - STDIN
          if ($PathSrc eq '-') {
            $debug and printf STDERR "%s. buffered input from STDIN\n", &MySub;
            $this->{SrcCurr}{path} = $PathSrc;
            $this->{SrcCurr}{opath} = 'stdin';
            $this->{handle} = FileHandle::Unget->new(\*STDIN);
          }
          # - GAP4 database (extracted to temporary file)
          elsif ($PathSrc =~ m/\.\w$/ and -e "$PathSrc.aux") {
            require SeqAlign::Gap; SeqAlign::Gap->import('&GapSafeCall');
            $this->{PathTmp} = $_LibParam{TmpManag}->Create();
            $call = join (' ', $CorePath{call}{GapExportCons}, '-f Experiment',
              $PathSrc, $this->{PathTmp});
            if ($ret = &GapSafeCall($call)) {
              printf STDERR "%s. ERROR: GAP4 consensus call failed (code $ret):\n"
                . "  $call\n", &MySub, $call;
              exit 1;
            }
            $this->{SrcCurr}{opath} = &PathExpand ($PathSrc);
            $this->{handle} = FileHandle->new($this->{SrcCurr}{path}=$this->{PathTmp});
          }
          # - "normal" input file or pipe call
          else {
            $this->{SrcCurr}{path} = $this->{SrcCurr}{opath}
              = &PathExpand ($PathSrc);
            # not FileHandle::Unget->new($PathSrc,'r') here, because we might
            # deal with pipes and we want to route method new() to perl's std
            # open function.
            $this->{handle} = FileHandle::Unget->new($PathSrc);
          }
          # no longer use $PathSrc down from here, use
          # $this->{SrcCurr}{path} or $this->{SrcCurr}{opath}

          # open file or iterate to next file in queue
          unless ($this->{handle}) {
            printf STDERR "%s. ERROR: failed to open source file %s (orig. %s)\n", &MySub,
              $this->{SrcCurr}{path}||"''", $this->{SrcCurr}{opath}||"''";
            redo GetNextRetry;
          }
          $debug and printf STDERR "%s. opened file %s, handle %s\n", &MySub,
            $this->{SrcCurr}{path}, ref($this->{handle});

          # determine sequence file format
          $this->{SrcCurr}{format} = &SeqFileFormat ($this->{handle}, -debug=>$dbg2);
          if ($this->{SrcCurr}{format}) {
            $debug and printf STDERR "%s. sequence file format is %s (handle %s)\n", &MySub,
              $this->{SrcCurr}{format}||"''", $this->{handle};
          } else {
            if (-s $this->{SrcCurr}{path}) {
              printf STDERR "%s. ERROR: unknown sequence file format in input file %s\n", &MySub,
                $this->{SrcCurr}{path}||"''";
            } else {
              printf STDERR "%s. WARNING: empty file %s\n", &MySub,
                $this->{SrcCurr}{path}||"''";
            }
            undef $this->{handle};
            redo GetNextRetry;
          }
        }

        # file queue empty
        else { return undef }
      }

  ##############################################################################
  # parse

      # chain to appropriate parsing function
      # parse from current file handle
      if    (ref($pFunc=$SeqFFmtGet{$this->{SrcCurr}{format}}{FuncEntry}) eq 'CODE') {
        $debug and printf STDERR "%s. calling parsing function: \$SeqFFmtGet{%s}{FuncEntry}\n"
          . "  supported switches: %s\n", &MySub, $this->{SrcCurr}{format}, join(', ',map{ "$_=>$lopt{$_}" }keys %lopt);
        $pSeq = &$pFunc ($this->{SrcCurr}{opath}, -handle=>$this->{handle}, %lopt);
        $debug and printf STDERR "%s. parsed sequence: %s\n", &MySub,
          $pSeq ? $$pSeq{id}||"''" : '*NONE*';
      }
      elsif (ref($pFunc=$SeqFFmtGet{$this->{SrcCurr}{format}}{FuncArr}) eq 'CODE') {
        $debug and printf STDERR "%s. calling parsing function: \$SeqFFmtGet{%s}{FuncArr}\n"
          . "  switches: %s\n", &MySub, $this->{SrcCurr}{format}, join(', ',map{ "$_=>$lopt{$_}" }keys %lopt);
        # fill sequence buffer with array of sequences (raw entries)
        $this->{SeqBufferRaw} = &$pFunc ($this->{SrcCurr}{opath}, -handle=>$this->{handle}, %lopt);
        $debug and printf STDERR "%s. parsed array of %d sequences - buffered\n", &MySub,
          int @{$this->{SeqBufferRaw}};
        $pSeq = shift @{$this->{SeqBufferRaw}};
      }

      # unknown sequence file format type (but, non-NULL from &SeqFileFormat !)
      else {
        printf STDERR "%s. code ERROR: unknown input sequence file format %s\n", &MySub,
          $this->{SrcCurr}{format}||"''";
        printf STDERR "  sequence file: %s\n", $this->{SrcCurr}{path}||"''";
        undef $this->{handle};
        redo GetNextRetry;
      }

      # no more sequences via current file handle? => recursion
      unless ($pSeq) {
        undef $this->{handle};
        redo GetNextRetry;
      }
    }

  ##############################################################################
  # select, purify/clip sequence string

    # select by sequence identifier / description
    if ($lopt{-MatchID}) {
      $debug and printf STDERR "%s. have sequence ID selector, type HASH\n", &MySub;
      if ($SeqFFmtGet{$this->{SrcCurr}{format}}{-MatchID}) {
        $debug and printf STDERR "%s. sequence ID matching was done in parsing function\n", &MySub;
      } else {
        $debug and printf STDERR "%s. performing sequence ID matching %s\n", &MySub,
          ref($lopt{-MatchID});
        unless ($lopt{-MatchID}{$$pSeq{id}}) { redo GetNextRetry }
      }
    }
    if ($lopt{-SlcID}) {
      $debug and printf STDERR "%s. have sequence ID selector, type regexp: %s\n", &MySub,
        $lopt{-SlcID};
      if ($SeqFFmtGet{$this->{SrcCurr}{format}}{-SlcID}) {
        $debug and printf STDERR "%s. sequence ID selection was done in parsing function\n", &MySub;
      } else {
        $debug and printf STDERR "%s. applying sequence ID selector %s\n", &MySub,
          $lopt{-SlcID};
        if ($$pSeq{id}!~m/$lopt{-SlcID}/) { redo GetNextRetry }
      }
    }
    if ($lopt{-SlcDescr} and !$SeqFFmtGet{$this->{SrcCurr}{format}}{-SlcDescr}) {
      unless ($$pSeq{descr} =~ m/$lopt{-SlcDescr}/) { redo GetNextRetry }
    }

    # filter by sequence identifier / description
    if ($lopt{-FilterID}) {
      if ($$pSeq{id}=~m/$lopt{-FilterID}/) { redo GetNextRetry }
    }
    if ($lopt{-FilterDescr} and $$pSeq{descr}=~m/$lopt{-FilterDescr}/) {
      redo GetNextRetry;
    }

    # select by sequence type
    # a type specification from file format has priority over a diagnosed type
    if ($lopt{-SlcType} and not $SeqFFmtGet{$this->{SrcCurr}{format}}{-SlcType}) {
      $$pSeq{SeqType} ||= &SeqType ($$pSeq{sequence});
      if ($$pSeq{SeqType} !~ m/$lopt{-SlcType}/) { redo GetNextRetry }
    }

    # correct sequence field 'SrcMulti'
    # just for full convenience, $SeqFFmtGet{$this->{SrcCurr}{format}}{FuncEntry}
    #   should have done it correctly
    $this->{SrcCurr}{SrcMulti} ||= $$pSeq{SrcMulti};
    $$pSeq{SrcMulti} ||= $this->{SrcCurr}{SrcMulti};

    # if available, store pure sequence in field 'SeqPure'
    # if wanted, purification should have been done in parsing function
    if ($lopt{-pure} and not $SeqFFmtGet{$this->{SrcCurr}{format}}{-pure}) {
      $pSeq = &SeqPure ($pSeq, -SeqType=>$lopt{-pure}, -debug=>$dbg2);
    }

    # cut off unknown symbols at ends of the sequence string
    if ($lopt{-ClipUnk} and not $SeqFFmtGet{$this->{SrcCurr}{format}}{-ClipUnk}) {
      $RegexpUnk = '['. lc ($SeqSmbUnk{&SeqType($$pSeq{sequence})}) . uc ($SeqSmbUnk{&SeqType($$pSeq{sequence})}) .' \n\r-]';
      $$pSeq{sequence} =~ s/^($RegexpUnk*)// and delete $$pSeq{SeqPure};
      $$pSeq{sequence} =~ s/($RegexpUnk*)$// and delete $$pSeq{SeqPure};
    }

    # select for length (pure sequence string)
    if ($lopt{-SlcLen}) {

      # need pure copy of sequence
      $$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence});
      $iSeqLen ||= length $$pSeq{SeqPure};

      # do selection
      if ($iSeqLen < $lopt{-SlcLen}[0]) { redo GetNextRetry }
      if ($lopt{-SlcLen}[1] and $iSeqLen > $lopt{-SlcLen}[1]) { redo GetNextRetry }
    }

    # upper / lower case letters in sequence string
    $lopt{-lower} and $$pSeq{sequence} =~ tr/A-Z/a-z/;
    $lopt{-upper} and $$pSeq{sequence} =~ tr/a-z/A-Z/;

  ##############################################################################
  # split entry

    # select sequence ends
    # - we do not handle annotations correctly, from here on
    if ($lopt{-SlcEnds}) {

      # need pure copy of sequence
      $$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence});
      $iSeqLen ||= length $$pSeq{SeqPure};

      # action to select sequence ends
      if ($iSeqLen > 2 * $lopt{-SlcEnds}) {
        $debug and printf STDERR "%s. selecting ends (%d letters) from sequence %s\n", &MySub,
          $lopt{-SlcEnds}, $$pSeq{id}||"''";

        # delete length information in sequence description
        # positional information entries are obsolete now
        delete $$pSeq{header};
          $$pSeq{descr} =~ s/([,;] +)?\d+\s+(aa|bases|base pairs|bp|characters|letters|residues)//g;
          delete $$pSeq{annot};

        # create sequence data structures for ends
        my $pSeqWait = &DataClone ($pSeq, -debug=>$dbg2);
        $$pSeq{sequence} = substr ($$pSeq{SeqPure}, 0, $lopt{-SlcEnds});
          $$pSeq{id} .= '_5end';
          $$pSeq{SeqPure} = $$pSeq{sequence};
          $$pSeq{SrcMulti} = 1;
        $$pSeqWait{sequence} = substr ($$pSeqWait{SeqPure}, -$lopt{-SlcEnds}, $lopt{-SlcEnds});
          $$pSeqWait{id} .= '_3end';
          $$pSeqWait{SeqPure} = $$pSeqWait{sequence};
          $$pSeqWait{SrcMulti} = 1;
        push @{$this->{SeqBuffer}}, $pSeqWait;
      }
    }

  } # end of retry block

  # return sequence entry
  $this->{NmbRead} ++;
  return $pSeq;
}


1;
# $Id: SeqStreamIn.pm,v 1.26 2018/06/05 18:02:56 szafrans Exp $
