################################################################################
#
#  kPerl GSCJ ReadWatch Concept
#  Object Library for a File Index on Sequencing Reads
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1999-2000, 2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - An index object meets the following purposes:
#
#   - see which read entries are present in the file structure:
#     Retrieve()    in a boolean context
#     FirstExtra()  extrapolates the existing read entries to the first
#                   non-existing
#   - retrieve the file path for a read entry
#     Retrieve()    in a boolean context
#
#   Basically, this object class should be derived from a general table
#   index object. However, we don't have this general object. May be,
#   somewhen there's time to create it.
#
# - individual description of functions at the beginning of the code blocks
#
# - For initialization, an index file has to be loaded in the fashion:
#   obj->new ($file)
#   the index file is in tab-delimited table format. Columns should be
#   organized in the following order:
#   0. ...
#   1. ...
#
################################################################################
#
#  OBJECT OPERATORS
#
#   *** NONE ***
#
#
#  OBJECT METHODS  for external access
#
# - housekeeping
#   new            create object, initialize via ini(@args)
#                  @args is a variable-length list of input files or input pipes.
#                  Argument "-" works as an alias to STDIN. Input handles are not
#                  allowed.
#   ini            (re-)initialize file queue with given list of files. This is
#                  called by new() in order to initialize the object but it may
#                  be used from outside also. However, as it has the same effect
#                  like creating a new object it will be more transparent to
#                  explicitly call new() instead of ini().
#   Clear          clear file queue
#                  Object options are preserved.
#   Load           load index data from file(s)
#   AddTreeIndex   add tree index to index data
#
# - functionality
#   Retrieve       path of trace item
#   FirstExtra     extrapolates the existing read entries to the first
#                  non-existing
#
#
#  OBJECT DATA STRUCTURE
#  (hash)
#
#  base            reference to root directories for read file tree
#    exper           root directory for Experiment files ("group" directory)
#    raw             root directory for raw data files
#  index           reference to data structure of read index
#  PathSource      reference on array of index input path(s)
#  switch          hash reference for object switches
#                  set options just by assignment
#    -debug          print debug protocol to STDERR
#    -FullMatch      perform full string matching when searching in index
#  time            time of index construction
#  TreeIndex       tree indices for fast sub-string match:
#                  CloneStout   clone -> primer + number
#                  ReadType     read type (clone + primer) -> number
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @ISA
#   %LibGlob
#   %_LibParam
#
# - housekeeping
#   $_LibParam{default}
#   $LibGlob{default}
#   &new  see MainLib::DefaultObjHash.pm
#   &ini
#   &_LocalSwitch  see MainLib::DefaultObjHash.pm
#   &Clear
#   &Load
#   &AddTreeIndex
#
# - look-up operations
#   &Retrieve
#   &_RetrvExtended
#   &FirstExtra
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

package ReadWatch::ReadIndex;

# includes
use strict; #use warnings;  # OK 20040906
use MainLib::DefaultObjHash;
use MainLib::Data qw(&DataClone);
use MainLib::Path qw(%CorePath);
use database::DbPlain;
use ReadWatch::Read;

# inheritance
our @ISA;
push @ISA, qw(MainLib::DefaultObjHash);

# package-wide constants and variables
our (%LibGlob);
my %_LibParam;


################################################################################
# housekeeping
################################################################################


# class parameters
$_LibParam{default} = {
  base  => {
    raw   => $CorePath{ReadWatch}{RawGroupDir},
    exper => $CorePath{ReadWatch}{ExperGroupDir},   # used by AlnProj.pm
    },
  index => $CorePath{ReadWatch}{ExperIndex},
  };
$LibGlob{default} = $_LibParam{default};


# parametric initialization
#
# INTERFACE
# - return val: - instance reference
#               - undef if an error occurred
#
sub ini {
  my ($this, @arg) = @_;

  # initialization
  $this->Clear();
  $this->{base} = &MainLib::Data::DataClone ($_LibParam{default}{base});

  # load index, eventually load default index file
  $this->Load (@arg);
  
  # return object reference
  return $this;
}


# drop index
#
sub Clear {
  my ($this) = @_;

  # initialize input path array and indices
  $this->{PathSource} = [];
  $this->{index} = undef;
  $this->{TreeIndex} = undef;

  # drop construction time
  $this->{time} = undef;
}


# load index from table file
#
# INTERFACE
# - argument(s): path(s) of input file(s), i.e. read indices,
#                default:  $_LibParam{default}{index}
#
# DESCRIPTION
# - the index is not initialized cause you may want to add index files
#   into the index structure by successive Load() calls.
#
sub Load {
  my ($this, @arg) = @_;
  my ($class, $debug, $dbg2);
  my ($file, $pIndex, @EntryNew, $bChanges);

  # function parameters
  $class = ref ($this);
  $debug = $this->{switch}{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  @arg or @arg = ( $_LibParam{default}{index} );

  # loop over input path args
  foreach $file (@arg) {

    # add file to input path array
    push @{$this->{PathSource}}, $file;

    # read table
    $pIndex = &PlainToTable ($file, -TabType=>'HIA', -comments=>1, -debug=>$dbg2);
    if (! $pIndex or ! int keys (%$pIndex)) {
      printf STDERR "%s. non-existing or empty index file %s\n", (caller(0))[3], $file||"''";
    }
    if ($debug) {
      printf STDERR "%s. got %d index entries from file %s\n", (caller(0))[3],
        int keys (%$pIndex), $file||"''";
      printf STDERR "  first entry: %s\n", join ('  ', @{ (values(%$pIndex))[0] });
    }
    
    # new entries in file? => enter entries to index
    if (@EntryNew = keys %$pIndex) {
      $bChanges = 1;

      # add to existing index
      # - duplicate IDs are overwritten, last encountered entry wins
      if (%{$this->{index}||{}}) {
        @{$this->{index}}{@EntryNew} = @$pIndex{@EntryNew};
      }
      
      # construct index
      else {
        $this->{index} = $pIndex;
      }
    }
  }

  # new entries in file? => enter entries to index
  if ($bChanges) {

    # update tree indices
    # what if this is the first load operation ??? The tree index needs to get
    #  initialized somehow, doen't it?
    foreach my $ItIdxType (keys %{$this->{TreeIndex}}) {
      $this->AddTreeIndex($ItIdxType);
    }

    # enter time of construction touch
    $this->{time} = time;
  }
}


# create tree index on reads
#
# INTERFACE
# - argument 1:  tree type specifier
#                see &ReadWatch::Read::ReadTreeIndex for available types
#                Currently, you may use only those tree indices which have
#                  one dendritic architecture: CloneStout, ClonePrm, ReadType
# - return val:  success status (boolean)
#
sub AddTreeIndex {
  my ($this, $IndexType) = @_;
  my ($pField, $CtId);

  # create extra index pointing to read entries
  $this->{TreeIndex}{$IndexType} = &ReadTreeIndex (
    [keys %{$this->{index}}],
    -format=>$IndexType);
  
  # success?
  return $this->{TreeIndex}{$IndexType} ? 1 : 0;
}


################################################################################
# look-up operations
################################################################################


# return paths of sequence read files according to ID selector
#
# INTERFACE
# - argument 1:  one of these target specifiers (case-insensitive):
#                Exper       Experiment file, read name
#                ExperSub    Experiment file, sub-path
#                ExperFull   Experiment file, full path
#                Sample      [not yet implemented] sample number
#                SampleFull  [not yet implemented]
# - argument 2+: read ID selector(s)
#                a selector is applied to the index in three look-up steps
#                1. full match search
#                2. full match search via secondary index
#                   cmp. $this->{TreeIndex} and $this->TreeIndex()
#                3. regexp match
#                   this search may be very slow on large indices. This
#                   search step may be switched off via object option
#                   -FullMatch.
# - return val:  array of (sub-)paths (.../target/machine/read-sample)
#
sub Retrieve {
  my ($this, $target, @arg) = @_;
  my ($class, $debug, $dbg2);
  my ($selector, $TgtType, $TgtFull, $TgtSub);
  my ($entry, $machine, @Slc1, %Slc2);

  # function parameters
  $class = ref ($this);
  $debug = $this->{switch}{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # what type of info is to be returned?
  $target =~ tr/A-Z/a-z/;
  unless ($target =~ m/^(exper|sample)(full)?(sub)?$/) { return () }
  $TgtType = $1;
  $TgtFull = $2 ? $this->{base}{$TgtType} : '';
  $TgtSub  = $3;

  # select ID entries from index
  foreach $selector (@arg) {
    $debug and printf STDERR "%s. doing retrieval for '$selector'\n", (caller(0))[3];
    @Slc1 = ();

    # ID with direct match
    if ($this->{index}{$selector}) {
      @Slc1 = ($selector);
    } else {

      # IDs via extended indices
      if ($this->{TreeIndex}) {
        @Slc1 = $this->_RetrvExtended($selector);
      }

      # IDs via RegExp match
      unless (@Slc1 or $this->{switch}{-FullMatch}) {
        @Slc1 = grep { m/$selector/ } keys (%{$this->{index}});
      }
    }

    # non-redundant list of Experiment sub-paths
    foreach (@Slc1) {
      if ($TgtFull or $TgtSub) {
        $machine = $this->{index}{$_}[0];
        if ($TgtType eq 'sample') { $machine =~ s/_/ /g; }
        $Slc2{$_} = $TgtFull .'/'. $machine .'/'. $_;
      } else {
        $Slc2{$_} = $_;
      }
    }
  }

  # return paths
  return values (%Slc2);
}


# look for selector matches within tree indices
#
# INTERFACE
# - argument 1:  read ID selector
# - return val:  array of matching read IDs
#
# DEBUG, CHANGES, ADDITIONS
# - tree indices only work if they're organized in two-level hierarchy
#   (tree index types 'CloneStout'/'ReadType').
#   Implement multiple-level hierarchy feature using recursive call of
#   new code block!
#
sub _RetrvExtended {
  my ($this, $selector) = @_;
  my ($pIdx) = @_;

  # loop over tree indices
  foreach $pIdx (values %{$this->{TreeIndex}}) {

    # selector with direct match => successful return
    if ($$pIdx{$selector}) {
      return (values %{$$pIdx{$selector}});
    }
  }

  # return unsuccessfully
  return ();
}


# find first unique identifier for extra read
#
# INTERFACE
# - argument 1: read type (clone & primer letter)
#               equivalent to read ID minus final number (and chemistry)
#
# - options beside object options:
#   -number     offset for read number, default:
#               2  for standard batch sequencing for primers r, s, t.
#                  number 1 is reserved for standard batch sequencing
#               1  for any other primer
#
# - return val: next unique identifier (number >= 2)
#
sub FirstExtra {
  my ($this, $ReadType, %opt) = @_;
  my ($class, $debug, $dbg2);
  my ($ReadNum);

  # function parameters
  $class = ref ($this);
  $debug = $this->{switch}{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $ReadNum = $opt{-number} ||
    ((substr ($ReadType, -1, 1) =~ m/[rst]/) ? 2 : 1);

  # check for existing read entries as fast as possible using available
  # master index
  while ($this->Retrieve ('ExperSub', $ReadType.$ReadNum)) {
    $ReadNum ++;
    next;
  }
  $debug and printf STDERR "%s. $ReadType -> $ReadType$ReadNum\n", (caller(0))[3];

  # return identifier
  return $ReadType.$ReadNum;
}


1;
# $Id: ReadIndex.pm,v 1.13 2018/06/05 18:02:56 szafrans Exp $
