################################################################################
#
#  kPerl Sequence Laboratory
#  Object Library for Sequential Sequence File Output
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 2000,2002,2004
#  szafrans@imb-jena.de
#  Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2004
#  karol@pcbi.upenn.edu
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 2005
#  szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
# - this class provides sequential sequence output to all sequence file
#   formats that're available via functions referenced in
#   %SeqLab::SeqFormat::SeqFFmtOutFunc.
#
################################################################################
#
#  OBJECT OPERATORS
#
#   *** NONE ***
#
#
#  OBJECT METHODS  for external access
#
# - housekeeping
#   new           create object, initialise via ini(@args)
#   ini           initialise object
#   AddSwitch     modify object options
#
# - functionality
#   Push          add sequence elements to the file queue
#   Flush         output all sequence entries buffered in queue
#                 This method does not influence the output target directives.
#                 Subsequent flushes onto the same output target are possible.          
#
#
#  OBJECT DATA STRUCTURE
#  (hash)
#
#   handle        current output file handle
#   NmbWrote      number of output sequences
#   PathDest      current destination path/pipe
#   PathDestTmp   temporary path for output of buffered sequences. This is
#                 only needes for switch -file=rewrite when output path may
#                 be equal to current input path.
#   SeqBuffer     reference to array of buffered sequence entries
#                 The buffer is ensured to contain sequence entries, that are
#                 destined for one single output target (managed by method Push)
#   switch        hash reference for object switches, cf. method AddSwitch()
#     -debug        print debug protocol to STDERR
#     -dir=S        write output files into specified directory. This value
#                   overrides any directory specifications provided via switch
#                   -file.
#     -file         specify output path or a pipe command string (including a
#                   trailing "|"). Path argument "-" is interpreted as STDOUT
#                   which is the default output direction.
#     -file=>'rewrite'
#                   preserve the file structure as in sequence input
#                   (determined from 'SrcPath' and 'SrcFmt' of sequence entries).
#                   Combination with other switches:
#                     -format   rewrite in specified file format
#                     -dir      rewrite files into specified directory
#     -file=>'SingleSeq'
#                   write single-sequence output with filename identical to
#                   sequence ID
#     -format       output sequence file format, default: fastA.
#                   Note: this switch has priority versus switch
#                   -file=>'rewrite'.
#                   A format specifier may be any of those defined in
#                   %SeqLab::SeqFormat::SeqFFmtOutFunc.
#     -IdFormat     apply modify actions to the identifier field, default: do not
#     -KeyId        use specified key to access sequence identifier (default:
#                   $SeqLab::SeqFormat::LibGlob{KeyId}). The success of this
#                   option depends on the output formatting function.
#     -KeySeq       use specified key to access sequence string (default:
#                   $SeqLab::SeqFormat::LibGlob{KeySeq}). The success of this
#                   option depends on the output formatting function.
#     -pipe         pipe output through system command statement. The statement
#                   needs to include a leading pipe character.
#     -pure         purify sequence strings for sequence-coding letters
#                   You may specify a sequence type. Then, fuzzy letters are
#                   converted to official 'unknowns'.
#                   Positional references are adjusted if needed.
#                   Upper/lower-case appearance of the sequence string remains
#                   unchanged.
#     -sort         sort sequence entries in each output file according to
#                   specified criterion:
#                     id     sequence ID
#                     descr  description entry
#     -TmpPreserve  do not unlink temporary files, propagate option -preserve
#                   to global manager of temporary files
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @ISA
#   %LibGlob
#   %_LibParam
#
# - basics
#   $_LibParam{TmpManag}
#   $_LibParam{default}
#   &new  see MainLib::DefaultObjHash.pm
#   &ini
#   $LibGlob{FileMagic} - magic values to option -file
#   &AddSwitch
#   &_LocalSwitch  see MainLib::DefaultObjHash.pm
#   &DESTROY
#
# - determine output directives
#   &_CurrFmt
#   &_CurrPath
#   &_CurrHandle
#
# - functionality
#   &Push
#   &Flush
#   &_CloseCurr
#   &_BufferSort
#   &_Write
#
################################################################################
#
#  DEVELOPER'S NOTES
#
# - calling hierarchy:
#   -> new
#      -> ini
#   -> AddSwitch
#     ?-> Flush ; _CloseCurr
#         -> _Write ->...
#   -> Push
#     ?-> Flush ; _CloseCurr
#         -> _Write ->...
#     ?-> _Write ->...
#   -> _Write
#      -> _CurrFmt
#      -> _CurrHandle
#        ?-> _CloseCurr
#         -> _CurrPath
#      -> &SeqLab::SeqFormat::SeqentryToFFmt
#   -> DESTROY
#      -> Flush
#      -> _CloseCurr
#
#
#  DEBUG, CHANGES, ADDITIONS
#
# - for construction of `$_LibParam{TmpManag} = MainLib::FileTmp->new' try
#   something like:
#   `$_LibParam{TmpManag} = MainLib::FileTmp->new(-package=>__PACKAGE__'
#   The according option would have to be implemented in MainLib::FileTmp.
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::SeqStreamOut;

# includes
#use strict; use warnings;  # OK 20050711
use FileHandle;
use MainLib::Data qw(&DataClone &DataPrint);
use MainLib::DefaultObjHash;
use MainLib::Path;
use MainLib::File;
use MainLib::FileTmp;
use MainLib::Misc qw(&MySub);
use Math::Calc qw(&Max);
use SeqLab::SeqFormat qw (%SeqidFunc &SeqidWord &SeqentryToFFmt);

# inheritance
our @ISA;
push @ISA, qw(MainLib::DefaultObjHash);

# package-wide constants and variables
our (%LibGlob);
my %_LibParam;


################################################################################
# basics
################################################################################


# handle temporary files globally
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $_LibParam{TmpManag} = MainLib::FileTmp->new(); };
&$pcFT;


# default object data
$_LibParam{default} = {
  switch => {
    -file => '-',
    },
  };
$_LibParam{DefaultFmt} = 'fastA';

# parametric initialisation
#
# INTERFACE
# - argument 1*: hash of object switches
# - return val:  object reference
#
sub ini {
  my ($this, %opt) = @_;

  # initialise tuple data structure
  # Always assign the default data structure. This definitely overrides all
  #   previous contents of %$this.
  %$this = %{ &DataClone($_LibParam{default}) };

  # enter object switches
  unless ($this->AddSwitch(%opt)) {
    printf STDERR "%s. ERROR in method AddSwitch\n", &MySub;
  }

  # return
  return $this;
}


$LibGlob{FileMagic} = [ qw(rewrite SingleSeq) ];
  # accessed by SeqHandle.pl

# enter object switches
#
# INTERFACE
# - argument 1*: hash of switches
# - return val:  success status (boolean)
#
# DESCRIPTION
# - for description of object switches see above
#
sub AddSwitch {
  my ($this, %oopt) = @_;
  my $debug = $this->{switch}{-debug} || $oopt{-debug};

  # loop over switches
  my ($bErr);
  while (my($key,$val) = each(%oopt)) {
    if (0) {}

    # delegate -debug to "MainLib::FileTmp"
    elsif ($key eq '-debug' and ($val||0)>1) {
      $bErr ||= ! $_LibParam{TmpManag}->AddSwitch($key=>$val-1,-preserve=>1);
    }

    # delegate -TmpPreserve to "MainLib::FileTmp"
    elsif ($key eq '-TmpPreserve') {
      $bErr ||= ! $_LibParam{TmpManag}->AddSwitch(-preserve=>$val);
    }

   # from here on, each case block exclusively defines the action of object
   # re-shaping associated with that switch
    if (0) {}

    # option -dir
    # - test existence
    # - flush buffer, initialise output directives
    elsif ($key eq '-dir' and ($val||'') ne ($this->{switch}{$key}||'')) {
      unless (-d $val) {
        if (-e $val) {
          printf STDERR "%s. ERROR: output destination exists, but is not a directory: %s\n", &MySub, $val||"''";
          $bErr = 1;
          next;
        } else {
          printf STDERR "%s. ERROR: output directory does not exist: %s\n", &MySub, $val||"''";
          $bErr = 1;
          next;
        }
      }
      $this->Flush(); $this->_CloseCurr();
      $this->{switch}{$key} = $val;
      if ($this->{switch}{-file}
      and &PathSplit($this->{switch}{-file})->{dirstmt}) {
        $debug and printf STDERR "%s. WARNING: directory specification in -file switch (%s) clashes with -dir switch (%s)\n", &MySub,
          $this->{switch}{-file}, $this->{switch}{-dir};
      }
    }

    # option -file
    # - flush buffer, initialise output directives
    elsif ($key eq '-file') {
      # modified output directive
      if (($val||'') ne ($this->{switch}{$key}||'')) {
        $this->Flush(); $this->_CloseCurr();

        # effective output directive
        if (length ($val||'')) {
          unless (&PathSplit($val)->{name}) {
            printf STDERR "%s. ERROR: value for -file switch (\"%s\") doesn't contain any effective filename\n", &MySub, $val||"''";
            $bErr = 1;
            next;
          }
          $this->{switch}{$key} = $val;
          if ($this->{switch}{-dir}) {
            $debug and printf STDERR "%s. WARNING: directory specification in -file switch (%s) clashes with -dir switch (%s)\n", &MySub,
              $this->{switch}{-file}, $this->{switch}{-dir};
          }
        }
        # clear output directive
        else {
          $this->{switch}{$key} = '-';
        }
      }
    }

    # option -format
    # - flush buffer, initialise output directives
    elsif ($key eq '-format' and ($val||'') ne ($this->{switch}{$key}||'')) {
      $this->Flush(); $this->_CloseCurr();
      $this->{switch}{$key} = $val;
    }

    # option -pipe
    # - check possible conflicts with other options
    #   Some options may conflict depending on the pipe used, e.g.:
    #   -format may clash with input restrictions of the pipe command
    elsif ($key eq '-pipe' and ($val||'') ne ($this->{switch}{$key}||'')) {
      if ($val) {
        if ($this->{switch}{-file}=~m/^(rewrite|SingleSeq)/) {
          print  STDERR "%s. ERROR: other options conflict with option -pipe\n", &MySub;
          exit 1;
        }
      }
      $this->Flush(); $this->_CloseCurr();
      $this->{switch}{$key} = $val;
    }

    #options that we just have to enter
    else {
      if (defined $val) { $this->{switch}{$key} = $val; }
      else       { delete $this->{switch}{$key}; }
    }
  }

  # return success status
  return !$bErr;
}


# what if i shall die?
#
sub DESTROY {
  my ($this) = @_;

  # flush any buffered sequences
  $this->Flush();
  $this->_CloseCurr();
}


################################################################################
# determine output directives
################################################################################


# return current file format for sequence output
#
# INTERFACE
# - argument 1: reference to sequence data structure
# - return val: file format descriptor, see SeqLab::SeqFormat.pm
#
sub _CurrFmt {
  my ($this, $pSeq) = @_;
  my $debug = $this->{switch}{-debug};

  my ($format);

  # priority of custom format
  if (exists($this->{switch}{-format}) and $format=$this->{switch}{-format}) {
    $debug and printf STDERR "%s. priority of custom format: \"%s\"\n", &MySub, $format;
    return $format;
  }

  # rewrite mode?
  if ($this->{switch}{-file} eq 'rewrite') {
    $format = $$pSeq{SrcFmt} || $_LibParam{DefaultFmt};
    $debug and printf STDERR "%s. current format is: \"%s\" (from %s || %s)\n", &MySub,
      $format, $$pSeq{SrcFmt}||"''", $_LibParam{DefaultFmt}||"''";
    return $format;
  }

  # default format
  $format = $_LibParam{DefaultFmt};
  $debug and printf STDERR "%s. default format: %s\n", &MySub, $format||"''";
  return $format;
}


# determine output file (final) for specified sequence entry
#
# INTERFACE
# - argument 1: reference to sequence data structure
# - options:    are not accepted here! Use method AddSwitch()
# - return val: - output file path or filehandle reference
#               - undef if an error occurred
#
# DESCRIPTION
# - no updates are done to the object data structure
#
sub _CurrPath {
  my ($this, $pSeq) = @_;
  my $debug = $this->{switch}{-debug};

  # file as specified
  my $PathCurr = $this->{switch}{-file};

  # rewrite to input file?
  if ($PathCurr eq 'rewrite') {
    unless ($PathCurr = $$pSeq{SrcPath}) {
      printf STDERR "%s. ERROR: sequence entry %s doesn't hold source path\n", &MySub, $$pSeq{id};
      return undef;
    }
  }

  # write to single-sequence file?
  elsif ($PathCurr eq 'SingleSeq') {
    $PathCurr = $$pSeq{id};

    # shorten long GenBank identifier line to Acc.No.
    # restrict char usage in identifier
    if ($PathCurr =~ m/\|/) {
      $PathCurr = &SeqidWord ($PathCurr);
    }
    # restrict char usage in identifier
    $PathCurr =~ s/[^a-zA-Z0-9_.-]+//g;
    # check consistency
    unless ($PathCurr) {
      printf STDERR "%s. ERROR: sequence entry without effective sequence ID\n", &MySub;
      return undef;
    }
  }

  # prepend directory path as specified
  if (
    $this->{switch}{-dir} and -d $this->{switch}{-dir} and
    $PathCurr ne '-' and !ref($PathCurr)
  ) {
    my $PathName = &PathSplit($PathCurr)->{name};
    unless ($PathName) {
      printf STDERR "%s. ERROR: path string %s doesn't contain file name\n", &MySub, $PathCurr;
      return undef;
    }
    $debug and printf STDERR "%s. prepending directory path %s\n", &MySub, $this->{switch}{-dir};
    $PathCurr = $this->{switch}{-dir} .'/'. $PathName;
  }

  # pipe output?
  if ($this->{switch}{-pipe} and !ref($PathCurr)) {
    $debug and printf STDERR "%s. prepending pipe command:\n  %s\n", &MySub, $this->{switch}{-pipe};
    my $PipeCmd = $this->{switch}{-pipe};
    if ($PathCurr and $PathCurr ne '-') {
      $PipeCmd .= " > $PathCurr";
    }
    $PathCurr = $PipeCmd;
  }

  # return path / pipe command
  $debug and printf STDERR "%s. returning path %s\n", &MySub, $PathCurr||"''";
  return $PathCurr;
}


# update output directives, return current file handle
#
# INTERFACE
# - argument 1: reference to current sequence data structure
# - options:    aren't accepted here! Use method AddSwitch()
# - return val: - filehandle reference
#               - undef if an error occurred
#
# DEVELOPER'S NOTES
# - argument 1: reference to current sequence data structure
#
sub _CurrHandle {
  my ($this, $pSeq) = @_;
  my $debug = $this->{switch}{-debug};

  my ($PathCurr, $PathEff);

  # derive output file in context of object switch -file=rewrite
  # we need to check for a change of the input file
  if ($this->{switch}{-file} eq 'rewrite') {

    # continue on current file
    # NOTE: $this->{handle} corresponds to $this->{PathDestTmp} rather than
    #   $this->{PathDest}. But, we use $this->{PathDest} for comparison since
    #   it contains the effective output path.
    if ($this->{handle} and
        $this->{PathDest} eq ($PathCurr=$this->_CurrPath($pSeq))
    ) {
      $debug and printf STDERR "%s. returning existing handle for file %s\n", &MySub,
        $this->{PathDestTmp}||$this->{PathDest}||"''";
      return $this->{handle};
    } else {
      $this->_CloseCurr();
    }
  }

  # derive output file in context of object switch -file=SingleSeq
  # start on new single-sequence output file
  if ($this->{switch}{-file} eq 'SingleSeq') {
    $this->_CloseCurr();
  }

  # turn to new output file
  unless ($this->{PathDest} and $this->{handle}) {
    $this->{PathDest} = $PathCurr ||= $this->_CurrPath($pSeq);
    $debug and printf STDERR "%s. turning to destination file %s\n", &MySub,
      $this->{PathDest}||"''";

    # do we have to protect a current input file?
    if ($$pSeq{SrcPath} and $PathCurr eq $$pSeq{SrcPath}) {
      $this->{PathDestTmp} = $_LibParam{TmpManag}->Create(-touch=>1);
    } else {
      delete $this->{PathDestTmp};
    }

    # get file handle
    $PathEff = $this->{PathDestTmp} || $this->{PathDest};
    if ($PathEff eq '-') {
      $this->{handle} = FileHandle->new_from_fd(&TrueStdout(),'w');
    } elsif ($PathEff=~m/\|/) {
      # channel input pipes to standard open function, mode statement "w" is
      # not required, here
      $this->{handle} = FileHandle->new($PathEff);
    } else {
      $this->{handle} = FileHandle->new($PathEff,'w');
    }
  }

  # return active file handle
  return $this->{handle};
}


################################################################################
# functionality
################################################################################


# enter sequence(s) into the queue
#
# INTERFACE
# - argument 1+: array of references to sequence data structure(s)
# - options:     aren't accepted here! Use method AddSwitch()
# - return val:  success status (boolean)
#
sub Push {
  my ($this, @arg) = @_;
  my $debug = $this->{switch}{-debug};
  my $bBufferForFile = (
    $this->{switch}{-sort} or
    ($this->{switch}{-file}||'') eq 'rewrite' or
    ($this->{switch}{-format}||'') eq 'selex' or
    0);
  if ($this->{switch}{-file} eq 'SingleSeq') { $bBufferForFile = 0 }

  # loop over sequences
  my $bErr;
  foreach my $pSeq (@arg) {
    $debug and printf STDERR "%s. entering sequence %s\n", &MySub, $$pSeq{id};

    # empty buffer if output destination changes
    if (@{$this->{SeqBuffer}||[]} and
        $this->_CurrPath($this->{SeqBuffer}[-1]) ne $this->_CurrPath($pSeq)
    ) {
      $debug and printf STDERR "%s. output destination has changed, flushing buffer\n", &MySub;
      $this->Flush();
      $bErr ||= ! $this->_CloseCurr();
      delete $this->{SeqBuffer};
    }

    # buffer sequence?
    if ($bBufferForFile) {
      push @{$this->{SeqBuffer}||=[]}, $pSeq;
      $debug and print  STDERR "  buffered\n";
    }
    # ... else output sequence
    else {
      $bErr ||= ! $this->_Write($pSeq);
      $debug and print  STDERR "  written\n";
    }
  }
  
  # return success status
  return !$bErr;
}


# flush complete buffer, move output to final destination
#
# INTERFACE
# - return val: success status (boolean)
#
# DESCRIPTION
# - apply -sort directive to buffered sequences if present (method
#   _BufferSort()).
# - this method does not influence the current output target directives.
#   Subsequent flushes onto the same output target are possible.
#
sub Flush {
  my ($this) = @_;
  my $debug = $this->{switch}{-debug};

  # any sequences in buffer to be flushed?
  my ($bSucc);
  if (@{$this->{SeqBuffer}||[]}) {
    # sort - necessity is tested there
    $this->_BufferSort();

    # write sequences to file
    if ($bSucc = $this->_Write(@{$this->{SeqBuffer}})) {
      delete $this->{SeqBuffer};
    }
    $bSucc or printf STDERR "%s. ERROR in writing buffered sequences\n", &MySub;
    return $bSucc;
  }

  # ... otherwise, there's nothing to do
  else {
    $debug and printf STDERR "%s. empty buffer\n", &MySub;
    return 1;
  }
}


# assure empty buffer, move output to final destination, erase current output
#   directives
#
# INTERFACE
# - return val: success status (boolean)
#
sub _CloseCurr {
  my ($this) = @_;
  my %oopt = %{$this->{switch}};
  my $debug = $oopt{-debug};

  # any sequences in buffer? => would be bad
  if (@{$this->{SeqBuffer}||[]}) { return 0 }

  # close output file handle
  delete $this->{handle};

  # move temporary output file?
  if ($this->{PathDestTmp}) {
    $debug and printf STDERR "%s. there's a temporary file: %s\n", &MySub, $this->{PathDestTmp};
    if (-s $this->{PathDestTmp}) {
      $debug and print  STDERR "  move it.\n";
      &mv ($this->{PathDestTmp}, $this->{PathDest});
    } else {
      $debug and print  STDERR "  delete it.\n";
      if (!$debug and !$oopt{-TmpPreserve}) { unlink $this->{PathDestTmp}; }
    }
  }

  # reinitialise output directives
  delete $this->{PathDest};
  delete $this->{PathDestTmp};

  # successful exit
  return 1;
}


# sort sequences in buffer according to -sort switch
#
sub _BufferSort {
  my ($this) = @_;
  my ($SortBy);

  # sort?
  if (exists($this->{switch}{-sort}) and $SortBy=$this->{switch}{-sort}) {
    @{$this->{SeqBuffer}} = sort {
      # this currently works for ascending ID / description
      $$a{$SortBy} cmp $$b{$SortBy} 
      } @{$this->{SeqBuffer}};
  }
}


# write sequence into appropriate file handle
#
# INTERFACE
# - argument 1*: reference(s) to sequence data structure(s)
# - return val:  success status (boolean)
#
# DESCRIPTION
# - the appropriate file handle will be derived via method _CurrHandle().
#
sub _Write {
  my ($this, @seq) = @_;

  # function parameters
  int(@seq) or return 1;
  my %oopt = %{$this->{switch}};
  my $debug = $oopt{-debug};
  my $format = $this->_CurrFmt($seq[0]);
  my $KeyId = $oopt{-KeyId} || $SeqLab::SeqFormat::LibGlob{KeyId};
  $debug and printf STDERR "%s. entered sub, %d sequence%s, ID key %s, output format %s\n", &MySub,
    int(@seq), (int(@seq)==1)?'':'s', $format;

  # selex format: determine maximum ID string length
  my ($iSeqidMax);
  my $bSelex = ($format eq 'selex');
    # $bSelex used later in formatting process
  if ($bSelex) {
    $iSeqidMax = &Max (map { length($_->{$KeyId}) } @seq);
  }

  # loop over sequences
  my ($ErrSum);
    # meant to be a counter for erroneous sequences
  my $pSeq;
  foreach $pSeq (@seq) {
    my ($bErr);

    # format sequence identifier
    if ($oopt{-IdFormat}) {
      $$pSeq{$KeyId} = &{$SeqidFunc{$oopt{-IdFormat}}} ($$pSeq{$KeyId});
      delete $$pSeq{header};
      $bErr ||= ! $$pSeq{$KeyId};
    }
    # format whole sequence entry
    my $sSeq = &SeqentryToFFmt ($pSeq, -format=>$format,
      %oopt, $bSelex ? (-delimit=>(' 'x($iSeqidMax+2-length($$pSeq{$KeyId})))):(),
      );
    if ($bErr ||= !length($sSeq)) {
      printf STDERR "%s. ERROR in sequence formatting, seq %s - skipping output\n", &MySub, $$pSeq{id};
      $ErrSum += $bErr;
      next;
    }

    # write to file
    my $hOut = $this->_CurrHandle($pSeq);
    if ($bErr ||= !$hOut) {
      printf STDERR "%s. ERROR in retrieving filehandle, seq %s, file %s\n", &MySub,
        $$pSeq{id}, $this->_CurrPath($pSeq);
      printf STDERR "  skipping output\n";
      $ErrSum += $bErr;
      next;
    }
    $bErr ||= ! print $hOut $sSeq;
    if ($bErr) {
      printf STDERR "%s. ERROR in output call, seq %s, file %s\n", &MySub,
        $$pSeq{id}, $this->_CurrPath($pSeq);
    }

    # summary
    $ErrSum += $bErr;
    $this->{NmbWrote} ++;
    $debug and printf STDERR "%s. sequence %s output to file %s -> %s\n", &MySub,
      $$pSeq{id}, $this->{PathDestTmp}||$this->{PathDest}||"''", $ErrSum?'error':'success';
  }

  # return success status
  return !$ErrSum;
}


1;
# $Id: SeqStreamOut.pm,v 1.20 2005/07/13 16:17:01 szafrans Exp $
