################################################################################
#
#  kPerl Sequence Laboratory
#  Object Library for a Suffix Trie to Store Biopolymer Sequences
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 1999-2001,2004
#  szafrans@imb-jena.de
#  Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2004-2005
#  karol@pcbi.upenn.edu
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 2005
#  szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - purpose
#   The algorithmic architecture and data access interface follow the
#   philosophy that the stored sequence tuples (subtrings) have a limited size
#   range, refelected by the object attributes TupleMin and TupleMax.
#
# - strandedness of a sequence is given special attention, see detailed
#   description in the document <release>/DocExample/SeqStrands.txt.
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  OBJECT OPERATORS
#
#   *** NONE ***
#
#
#  OBJECT METHODS  for external access
#
# - housekeeping
#   new           create object, initialise via ini(@args)
#   ini           initialise existing object. Options that can be set only via
#                 new() or ini(), not permanently stored in $this->{switch}:
#                 -load      initialise from data structure file
#                 -symbol    reference to array of sequence symbols, cmp.
#                            -TrashSmb. The setting is permanently stored in
#                            $this->{alph}{symbol}
#                 -TrashSmb  trash symbol = "unknown" in sequence alphabet
#                            The setting is permanently stored in
#                            $this->{alph}{TrashSmb}
#   AddSwitch     render object options
#   alph          query sequence alphabet
#   alph_ok       query sequence alphabet, including trash symbol
#
# - data management and access
#   NodeLbl       return array of data labels in (1st available) tuple data
#                 structure
#   NodeIterat    low-level interface for iteration over suffix trie
#   LevelTransit  add transition frequency calculi to nodes of a suffix trie
#                 level
#   LevelIndex    return hash representation of a suffix trie level:
#                 key = tuple string, value = reference to node data (hash)
#   LevelSum      return count sum of a suffix trie level
#
# - data I/O
#   Load          ini(-load=>$file) is the recommended interface
#   DataPrint
#   LevelPrint
#
# - analysis & evaluation
#   EnterSlice    enter tuple string into suffix trie
#   SeqScan       scan full sequence string and add tuple counts in suffix trie
#   NodeData      return data of a trie node (tuple)
#
#
#  OBJECT DATA STRUCTURE
#  (hash)
#
#   switch       hash reference for object switches, cf. &ini
#     -debug       print debug protocol to STDERR
#     -purify      remove all gaps from input sequences prior to analysis
#     -strands     0   process both strands as a unit (default)
#                      forward and reverse strand tuples of a dsDNA slice will
#                      be linked to the same single data node
#                  -1  process reverse-complement strand only
#                  1   process sense strand only
#                  2   process both strands independently
#     -TupleMin    minimum tuple size, default 1
#     -TupleMax    maximum tuple size, default 6
#     -upper       force sequence string input to upper case letters, default: 1
#   alph         sequence alphabet
#     symbol     reference to array of sequence symbols, cmp.
#                $this->{alph}{TrashSmb}.
#     TrashSmb   trash symbol = "unknown" in sequence alphabet
#   SuffixTrie   data container root
#                Tuples are organized in a tree of nodes, each node represented
#                by a hash containing symbol entries referring to the following
#                subtree. Additionally, a node contains an entry 'data', hash
#                of structure:
#                  ct    tuple count
#                  freq  virtual! depending on 'ct': tuple frequency
#   trash        reference to tuple trash can
#                each object has its own trash can, according to the setting of
#                the sequence alphabet
#   LevelIndex   tuple data index meant to serve as a memory to method
#                LevelIndex(). The index is an array representing all levels
#                in [TupleMin..TupleMax]. Each level is represented by a hash of
#                tuples referring to the corresponding data entries of the
#                suffix trie level.
#   LevelSum     tuple instance sums serving as a memory to method LevelSum().
#                The structure is an array keeping site sum values for each
#                level.
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @ISA
#   %_LibParam
#
# - object housekeeping
#   $_LibParam{default}
#   &new  see MainLib::DefaultObjHash.pm
#   &ini
#   &AddSwitch
#   &_LocalSwitch  see MainLib::DefaultObjHash.pm
#   &_DefTrash
#   &alph
#   &alph_ok
#
# - data management and access
#   &_NewNode
#   &_Get1stNode
#   &_GetNode
#   &NodeLbl
#   &_NodeLinkRevcompl
#   &NodeIterat
#   &_NodeSpiral
#   &_LevelValid
#   &LevelTransit
#   &LevelIndex
#   &LevelSum
#
# - data I/O
#   &Load
#   $_LibParam{DataPrintNot}
#   &DataPrint
#   &LevelPrint
#
# - analysis & evaluation
#   &EnterSlice
#   &SeqScan
#   &NodeData
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - a 'data' entry is generated in the basic node at $this->{SuffixTrie}.
#   This doesn't make sense.
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::SuffixTrie;

# includes
#use strict; use warnings;  # OK 20040715
use MainLib::Data;
use MainLib::DefaultObjHash;
use MainLib::StrRegexp qw(&TimeStr);
use MainLib::Misc qw(&MySub);
use Math::Calc;
use SeqLab::SeqBench;

# inheritance
our @ISA;
push @ISA, qw(MainLib::DefaultObjHash);

# package-wide constants and variables
my %_LibParam;


################################################################################
# object housekeeping
################################################################################


# default object data
$_LibParam{default} = {
  switch => {
    -strands  => 0,
    -purify   => undef,
    -upper    => 1,
    -TupleMin => 1,
    -TupleMax => 6,
    },
  alph => {
    symbol   => [ qw(A C G T) ],
    TrashSmb => 'N',
    },
  trash => { },
  };

# parametric initialisation
#
# INTERFACE
# - argument 1*: hash of switches
# - options:     all instance switches may be set here locally. List of
#                function-specific switches:
#   -load        load data from file
# - return val:  object reference
#
# DESCRIPTION
# - note that method new() will create a new empty object, and ini() may
#   also be used to initialise an object that already exists.
# - this function will be called by method new()
#
sub ini {
  my ($this, %opt) = @_;

  # initialise tuple data structure
  # Always assign the default data structure. This definitely overrides all
  #   previous contents of %$this.
  %$this = %{ &DataClone($_LibParam{default}) };

  # options that function only via methods new() and ini()
  {
    # load data from file?
    if ($opt{-load}) {
      unless ($this->Load($opt{-load})) {
        printf STDERR "%s. ERROR in method Load, file %s\n", &MySub, $opt{-load}||"''";
        exit 1;
      }
      delete $opt{-load};
    }
    # alphabet symbols
    if (exists ($opt{-symbol})) {
      if ($opt{-symbol} and @{$opt{-symbol}}) {
        $this->{alph}{symbol} = [ @{$opt{-symbol}} ];
      }
      delete $opt{-symbol};
    }
    # trash symbols
    if (exists ($opt{-TrashSmb})) {
      if ($opt{-TrashSmb}) {
        if (length($opt{-TrashSmb}) != 1) {
          printf STDERR "%s. ERROR: illegal argument to option -TrashSmb: %s\n", &MySub, $opt{-TrashSmb};
          exit 1;
        }
        $this->{alph}{TrashSmb} = $opt{-TrashSmb};
      }
      delete $opt{-TrashSmb};
    }
  }

  # enter object switches
  unless ($this->AddSwitch(%opt)) {
    printf STDERR "%s. ERROR in method AddSwitch\n", &MySub;
  }

  # initialise juvenile tuple data structure
  # - trash can
  # - tuple data structure root node
  unless (values %{$this->{trash}}) { $this->_DefTrash(); }
  unless (exists($this->{SuffixTrie}) and values %{$this->{SuffixTrie}}) {
    $this->{SuffixTrie} = $this->_NewNode();
  }
  $this->{switch}{-debug} and printf STDERR "%s. symbols after initialisation: {%s} unk %s\n", &MySub,
    join(',',$this->alph()), $this->{alph}{TrashSmb};

  # return object
  return $this;
}


# enter object switches
#
# INTERFACE
# - argument 1*: hash of object options
# - return val:  success status (boolean)
#
sub AddSwitch {
  my ($this, %oopt) = @_;

  # loop over switches
  while (my($key,$val) = each(%oopt)) {

    # options badly placed at this interface
    if ($key=~m/^-(load|symbol|TrashSmb)$/ and exists($this->{SuffixTrie}) and %{$this->{SuffixTrie}}) {
      printf STDERR "%s. WARNING: option -%s will function only via methods new() and ini()\n", &MySub, $1;
      next;
    }

    # process option -strands
    if ($key eq '-strands') {
      unless (defined $val) { next }
      if ($val<-1 or $val>2) {
        printf STDERR "%s. ERROR: illegal argument to option -strands: %s\n", &MySub, $val||"''";
        exit 1;
      }
      # enter later value on
    }

    #options that we just have to enter
    if (defined $val) { $this->{switch}{$key} = $val; }
    else       { delete $this->{switch}{$key}; }
  }

  # return success status
  return 1;
}


# define trash can according to object switches
#
# DESCRIPTION
# - the "tuple trash can" works like /dev/null
# - initialisation of data root with trash symbol causes problems if we
#   later redefine the trash symbol. Once defined, the trash can needs to be
#   conserved over the full life time of the object.
#
sub _DefTrash {
  my ($this) = @_;
  my ($key, $val);

  # suffix trie shall not contain more than default data
  if (exists($this->{SuffixTrie}) and %{$this->{SuffixTrie}}) {
    printf STDERR "%s. ERROR: cannot (re-)define trash on filled object\n", &MySub;
    exit 1;
  }

  # construct trash can according to object settings, cmp. switches -symbol
  # and -TrashSmb processed in ini()
  $this->{trash} ||= {};
  # all ways lead into trash
  foreach ($this->alph_ok()) {
    $this->{trash}{$_} = $this->{trash};
  }
}


# return sequence alphabet (array of symbols)
#
sub alph {
  return @{$_[0]->{alph}{symbol}};
}

# return sequence alphabet (array of symbols)
#
sub alph_ok {
  return (@{$_[0]->{alph}{symbol}}, $this->{alph}{TrashSmb});
}


################################################################################
# data management and access
################################################################################


# new node, simply referencing trash can via trash symbol
#
sub _NewNode {
  my ($this) = @_;

  # delete memory data which are obsolete now
  delete $this->{LevelIndex};

  # return new suffix trie node
  return { $this->{alph}{TrashSmb}=>$this->{trash} };
}


# print level of suffix trie as a table
#
# INTERFACE
# - options:    all instance switches may be set here locally. List of
#               function-specific switches:
#   -expand     add labels to list, that may be computed in interface methods
#               like PrintLevel()
#
# - return val: array of column labels
#
sub NodeLbl {
  my %expand = {
    ct => [ qw(freq) ],
    };
  my ($this, %opt) = @_;
  my ($pNode, @col, %ColIdx);

  # primary list from first available data node
  $pNode = $this->_Get1stNode();
  @col = keys %{$$pNode{data}};

  # refine/expand list
  if ($opt{-expand}) {
    push @col, map { $expand{$_} ? @{$expand{$_}}:() } @col;
  }
  %ColIdx = map { ($_=>1) } @col;
  @col = keys %ColIdx;

  return @col;
}


# get reference to any valid data node
#
# INTERFACE
# - return val: reference to tuple node data structure
#
sub _Get1stNode {
  my ($this) = @_;
  my ($pRoot, $pNode);

  # starting point of suffix trie data structure
  $pRoot = $this->{SuffixTrie};

  # find first valid node
  $pNode = ( map { exists($$pRoot{$_}) ? $$pRoot{$_} : () }
    @{$this->{alph}{symbol}} )[0];
  return $pNode;
}


# get reference to data node
#
# INTERFACE
# - argument 1: sequence slice equivalent to target node
#
# - options:    all instance switches may be set here locally. List of
#               function-specific switches:
#   -BaseNode   reference to base node data structure
#   -LinkRev    ensure linkage between standard and reverse-complement
#               A node could be linked to the reverse-complement node
#               automatically if ($this->{switch}{-strands} == 0).
#               But, what happens if this is called by &_NodeLinkRevcompl?
#               => endless loop.
#
# - return val: reference to target tuple node data structure
#
# DESCRIPTION
# - if a node doesn't exist it'll be created.
#
sub _GetNode {
  my ($this, $slice, %opt) = @_;
  my ($pNode, $letter);

  # starting point of suffix trie data structure
  $pNode = $opt{-BaseNode} || $this->{SuffixTrie};

  # loop into target depth
  foreach $letter (split(//,$slice)) {

    # get onto next suffix trie level (ensure there's one)
    $pNode = $$pNode{$letter} ||= $this->_NewNode();
  }

  # ensure linking of the node?
  # either there's a 'data' field in the target node or it has to be linked
  if ($opt{-LinkRev}) {
    unless ($$pNode{data}) {
      $this->_NodeLinkRevcompl($pNode,$slice);
    }
  }

  # return tuple node reference
  return $pNode;
}


# link node data entry to its reverse complement
#
# INTERFACE
# - argument 1: reference to tuple node corresponding to tuple string
# - argument 2: tuple string
#
sub _NodeLinkRevcompl {
  my ($this, $pNode, $slice, %opt) = @_;
  my ($SliceRcpl, $pNodeRcpl, $pData);

  # get node for reverse-complement of sequence slice
  $SliceRcpl = &SeqStrRevcompl ($slice);
  $pNodeRcpl = $this->_GetNode($SliceRcpl);

  # get or create the data entry of the node(s)
  $pData = $$pNode{data} || $$pNodeRcpl{data} || {};

  # enter default structure for node
  foreach ($pNode, $pNodeRcpl) {
    unless (%$pNode) {
      %$_ = %{ $this->_NewNode()||{} };
    }
    $_->{data} = $pData;
  }

  # which one is redundant?
  if (($slice cmp $SliceRcpl) < 0) {
    $$pNodeRcpl{isRed} = 1;
  } elsif (($slice cmp $SliceRcpl) > 0) {
    $$pNode{isRed} = 1;
  }
}


# perform full recursion over branches of suffix trie
#
# INTERFACE
# Same as &_NodeSpiral, but omitting the arguments. As the arguments of
# &NodeSpiral are mostly useful during recursion only, this function is
# provided as an argument-less entry interface to &_NodeSpiral.
#
# DEVELOPER'S NOTE
# - we do not need to delete the memory data structure $this->{LevelIndex}
#   because it is just an index to the nodes, not a clone of the data structure.
#
sub NodeIterat {
  my ($this, %opt) = @_;
  return $this->_NodeSpiral ((undef) x 3, %opt);
}


# perform spiral loop (recursion) over branches of suffix trie
#
# INTERFACE
# As the arguments are mostly useful during recursion only, the function
# &NodeIterat is provided as an argument-less interface to &_NodeSpiral.
# - argument 1: level startup value, default: 0 = 'base level'
# - argument 2: refer to specified sequence slice, default: ''
# - argument 3: reference to trie (sub)structure entry node
#               default: trie root node
#
# - options:    all instance switches may be set here locally. List of
#               function-specific switches:
#   -LevelLimit branch up onto specified suffix trie level, default end: when
#               undefined node is reached
#   -onAll      perform referenced code on all levels. This works only in
#               conjunction with option -LevelLimit.
#   -onTop      perform referenced code only on top level
#
# DESCRIPTION
# - this function works with code references. The arguments of &_NodeSpiral
#   are exactly handed over to the called anonymous code (options -onAll and
#   -onTop).
#
sub _NodeSpiral {
  my ($this, $LevelNum, $slice, $pNode, %opt) = @_;
  my (%lopt, $debug);

  # function parameters
  %lopt = $this->_LocalSwitch(%opt);
  $debug = $lopt{-debug};

  # by default start at base of data structure
  unless ($LevelNum) {
    $pNode ||= $this->{SuffixTrie};
  }
  unless (defined($pNode) or $opt{-LevelLimit}) { return }
  $slice ||= '';
  $debug and printf STDERR "%s. I'm on level %d, slice %s\n", &MySub,
    $LevelNum, $slice;

  # actions
  if ($LevelNum) {

    # on all nodes
    if (ref($opt{-onAll}) eq 'CODE') {
      &{$opt{-onAll}} ($this, $LevelNum, $slice, $pNode, %opt);

      # reached branching limit?
      if ($LevelNum == $opt{-LevelLimit}) { return }
    }

    # on target node level
    if ($LevelNum == $opt{-LevelLimit} and ref($opt{-onTop}) eq 'CODE') {
      &{$opt{-onTop}} ($this, $LevelNum, $slice, $pNode, %opt);
      return;
    }
  }

  # branching further into target depth
  foreach my $letter ($this->alph()) {
    # get onto next suffix trie level
    if (exists($$pNode{$letter}) and %{$$pNode{$letter}}) {
      $this->_NodeSpiral ($LevelNum+1, $slice.$letter, $$pNode{$letter}, %opt);
    }
  }
}


# verify existence of a suffix trie level
#
# INTERFACE
# - argument 1: level value
#               level numbering syntax is biological (starting at tuple size 1)
# - return val: validity status (boolean)
#
sub _LevelValid {
  my ($this, $LevelNum) = @_;
  my $pOopt=$this->{switch};
  my $bLevelValid = ($LevelNum>=$pOopt->{-TupleMin} and $LevelNum<=$pOopt->{-TupleMax}) ? 1 : 0;
  if (! $bLevelValid) {
    printf STDERR "%s. ERROR: access to non-existing tree level: %d\n", &MySub,
      $LevelNum;
    printf STDERR map{ "  called by $_\n" }grep{ $_ }map{ (caller($_))[3] } 1,2,3;
  }
  return $bLevelValid;
}


# add transition frequency calculi to nodes of a suffix trie level
#
# INTERFACE
# - argument 1: level value
#               level numbering syntax is biological (starting at tuple size 1)
# - return val: object reference
#
# - object data
#   {SuffixTrie}...{data}{ctpsi}
#   {SuffixTrie}...{data}{transitfreq}
#
# DEVELOPER'S NOTE
# - note that script SeqMotif.pl also has an implementation
#   for calculating transition frequencies: function &TupleTransit.
# - we do not need to delete the memory data structure $this->{LevelIndex}
#   because it is just an index to the nodes, not a clone of the data structure.
#
sub LevelTransit {
  my ($this, $LevelNum, %opt) = @_;
  my ($debug);
  my ($LevelPrefix, $pActionTop);

  # function parameters
  $debug = $this->{switch}{-debug} || $opt{-debug};
  $this->_LevelValid($LevelNum) or return $this;
  $LevelPrefix = $LevelNum - 1;

  # do recursion over suffix trie data structure
  # define top-level action (at prefix level):
  # - force existence of suffix nodes, create local index on suffix nodes
  # - add attributes "ctpsi", "transitfreq"
  $pActionTop = sub {
    my ($this, undef, undef, $pNode) = @_;
    my (%IdxSuffix, $CtSum, $smb);
    foreach $smb ($this->alph()) {
      $IdxSuffix{$smb} = $this->_GetNode($smb,-BaseNode=>$pNode);
        # We rely on _GetNode()'s feature to create nodes that do not
        # already exist.
      $IdxSuffix{$smb}{data}{ctpsi} = $IdxSuffix{$smb}{data}{ct} + 0.5;
      $CtSum += $IdxSuffix{$smb}{data}{ctpsi};
    }
    foreach $smb ($this->alph()) {
      $IdxSuffix{$smb}{data}{transitfreq} = $IdxSuffix{$smb}{data}{ctpsi} / $CtSum;
    }
  };
  # exucute recursion top-level action (at prefix level):
  # - does it work if $LevelPrefix==0?
  $this->NodeIterat (-onTop=>$pActionTop, -LevelLimit=>$LevelPrefix, %opt);

  return $this;
}


# return tuple level index
#
# INTERFACE
# - argument 1: level value
#               level numbering syntax is biological (starting at tuple size 1)
# - return val: - reference to requested tuple level index
#                 stored as $this->{LevelIndex}[$LevelNum]
#               - undef if an error occurs
#
# - object data
#   {LevelIndex}[$LevelNum]
#
sub LevelIndex {
  my ($this, $LevelNum, %opt) = @_;
  my ($debug);

  # function parameters
  $debug = $this->{switch}{-debug} || $opt{-debug};
  $this->_LevelValid($LevelNum) or return undef;

  # need to (re-)index the level, or is data present?
  unless ($this->{LevelIndex}[$LevelNum]) {

    # define top-level action, do recursion over suffix trie data structure
    my $pActionTop = sub {
      my ($this, $LevelNum, $slice, $pNode) = @_;
      $this->{LevelIndex}[$LevelNum]{$slice} = $$pNode{data} || '0';
    };
    $this->NodeIterat (-onTop=>$pActionTop, -LevelLimit=>$LevelNum, %opt);
  }

  # index present and (always) up to date
  else {
    $debug and printf STDERR "%s. already have a tuple index\n", &MySub;
  }

  # return index
  return $this->{LevelIndex}[$LevelNum];
}


# determine total tuple sites for tuple level
#
# INTERFACE
# - argument 1: level value
#
# - return val: total tuple sites for tuple level
#
# - object data
#   {LevelIndex}[$LevelNum]
#   {LevelSum}[$LevelNum]
#
# DESCRIPTION
# - the aim is to determine the number of tuple sites in the underlying
#   sequence, not the count sum since the latter depends on the strand model.
# - the number of tuple sites should be
#     n(tuple) = nt(seq) - n(seq) * (nt(tuple)-1)
#   given that:
#   - sequences don't contain Ns, otherwise subtract:
#     n(N sites) * (nt(tuple)-1)
#   - nt(seq) always > nt(tuple), otherwise neglect these sequences
#
sub LevelSum {
  my ($this, $LevelNum, %opt) = @_;
  my ($pTupleNonred);

  # function parameters
  $this->_LevelValid($LevelNum) or return 0;

  # non-redundant array of tuples
  # this access also updates the tuple index
  if (!$this->{switch}{-strands} and !($LevelNum % 2)) {

    # double-stranded sequence, symmetric tuples are possible
    $pTupleNonred = &DataTreeSlc ($this->LevelIndex($LevelNum,%opt),
      [[undef,'all']], -unique=>1);

  } else {
    # - single-stranded sequence -> no redundancy at all
    # - double-stranded sequence, but odd sized tuples. The list is redundant
    #   by a factor of 2.0 (no symmetric tuples are possible). Later, we just
    #   recalculate the sum value.
    $pTupleNonred = [values %{ $this->LevelIndex($LevelNum,%opt)||{} }];
  }

  # calculate level sum value
  $this->{LevelSum}[$LevelNum] = &Sum (map { $_->{ct} } @$pTupleNonred);
  # double-stranded sequence, but odd sized tuples. The list is redundant
  # exactly by a factor of 2.0 (no symmetric tuples are possible). Now, we just
  # recalculate the sum value.
  if (!$this->{switch}{-strands} and ($LevelNum % 2)) {
    $this->{LevelSum}[$LevelNum] /= 2;
  }

  # return sum
  return $this->{LevelSum}[$LevelNum];
}


################################################################################
# data I/O
################################################################################


# load object data from file
#
# INTERFACE
# - argument 1: path of input file
# - options:    all instance switches may be set here locally.
# - return val: success status (boolean)
#
sub Load {
  my ($this, $PathData, %opt) = @_;
  my ($debug);
  my ($pData);

  # function parameters
  $debug = $this->{switch}{-debug} || $opt{-debug};

  # load data
  $pData = &DataRead ($PathData);
  $debug and printf STDERR "%s. %d entries in suffix trie root node: %s\n", &MySub,
    int(grep{ $_ }values %{$$pData{SuffixTrie}}),
    join(' ',grep{ $$pData{SuffixTrie}{$_} }keys %{$$pData{SuffixTrie}});
  %$this = %$pData;

  # procedure status
  return keys %$this;
}


$_LibParam{DataPrintNot} = [ qw(LevelIndex LevelSum) ];

# print data structure of suffix trie
#
# INTERFACE
# - options:    all instance switches may be set here locally. List of
#               function-specific switches:
#   -handle     output file handle, default: STDOUT
#
# DESCRIPTION
# - This is what is meant to be the OO data interface to
#   &MainLib::Data::DataPrintValue. If called by another function it behaves.
#   like &MainLib::Data::DataPrint on first entry.
#
sub DataPrint {
  my ($this, %opt) = @_;
  my ($hOut, $NumSpace);
  my (%ThisPrint);

  # function parameters
  $opt{-had} ||= { "$this"=>1 };
  $hOut = $opt{-handle} ||= \*STDOUT;
  $NumSpace = $opt{-space} ||= 0;
  if (! defined($opt{-SpaceIncr})) { $opt{-SpaceIncr} = 2 }

  # prepare printed object data
  %ThisPrint = %$this;
  foreach (@{$_LibParam{DataPrintNot}}) {
    delete $ThisPrint{$_};
  }

  # prepare printed object data
  if ((caller(1))[3] =~ m/DataPrintValue$/) {
    printf $hOut "%s%s{\n", ' ' x $NumSpace, $this;
    $opt{-space} += $opt{-SpaceIncr};
  } else {
    printf $hOut "%s__my_ref__  %s\n", ' ' x $NumSpace, $this;
  }
  &MainLib::Data::DataPrint (\%ThisPrint, -NoMyRef=>1, %opt);
}


# print level of suffix trie as a table
#
# INTERFACE
# - argument 1: level number
#
# - options:    all instance switches may be set here locally. List of
#               function-specific switches:
#   -action     node action (code reference), argument interface provided by
#               framing code:
#                 arg1     tuple string
#                 arg2     reference to node data
#                 %opt:
#                   -LevelNum  tuple size (level of suffix trie)
#                   -sum       sum of tuple counts, pre-computed by &LevelPrint
#                   -tsize     LevelNum
#               function is expected to output some of the node data
#   -column     column labels, state in conjunction with &action
#   -handle     output file handle, default: STDOUT
#   -TabLabel   label of the table (short phrase), printed into first line of
#               the table header
#
sub LevelPrint {
  my ($this, $LevelNum, %opt) = @_;
  my (%lopt, $debug, $hOut);
  my (@column, $TabLabel);

  # function parameters
  %lopt = $this->_LocalSwitch(%opt);
  $debug = $lopt{-debug};
  $hOut = $opt{-handle} ||= \*STDOUT;

  # print table header
  $TabLabel = $opt{-TabLabel} || 'tuple frequency table';
  @column = $lopt{-column} ? @{$lopt{-column}} : qw(tuple ct freq);
  printf $hOut "# %s - %s\n", join ('::', (split('::',__PACKAGE__))[-2,-1]), $TabLabel;
  printf $hOut "# suffix trie level, tuple size %d\n", $LevelNum;
  printf $hOut "# alphabet: {%s}, unk %s\n", join(',',$this->alph()), $this->{alph}{TrashSmb};
  printf $hOut "# options: %s\n", join (', ', map { "$_ $lopt{$_}" }
    grep { defined($lopt{$_}) and !m/^-(action|handle|Tuple(Min|Max))/ } keys %lopt);
  printf $hOut "# date/time: %s\n", &TimeStr();
  printf $hOut "#\n# column labels:\n# %s\n", join("\t",@column);

  # print table
  my $pActionNode = $lopt{-action} || sub {
    my ($TupleVal, $pTupleDat, %opt) = @_;
    printf $hOut "%s\t%d\t%s\n", $TupleVal,
      $$pTupleDat{ct},
      $$pTupleDat{ct} / $opt{-sum};
    };
  # need level sum value
  # $this->LevelSum() also updates the tuple index
  my %sopt = %opt;
  $sopt{-sum} = $this->LevelSum($LevelNum) or return;
  $sopt{-LevelNum} = $sopt{-tsize} = $LevelNum;
  # iterate through nodes of the suffix trie, act for node output
  foreach my $ItTuple (sort keys %{$this->{LevelIndex}[$LevelNum]}) {
    &$pActionNode ($ItTuple, $this->{LevelIndex}[$LevelNum]{$ItTuple}, %sopt);
  }
}


################################################################################
# analysis & evaluation
################################################################################


# for a sequence slice: add all tuple levels to data structure
#
# INTERFACE
# - argument 1: sequence slice
#
# - options:
#   -action     node action (code reference), argument interface provided by
#               framing code:
#                 arg1     reference to node data
#               function is expected to modify the node data structure
#   -pos        actual sequence position of the sequence slice
#               This option does not take effect on a standard call, but it
#               might be exploited by a customised node action (cmp. option
#               -action). NOTE: Position values are in biological notation.
#
# DESCRIPTION
# - for recursing the suffix trie we don't use &_GetNode here because we
#   are able to successively get one step forward into a subtree
#   when coming from the current node.
# - strandedness setting (option -strands) is dealt with here only in stranded-
#   ness mode 0 = "both strands as a unit". In that case, the function will link
#   forward and reverse-complement tuples to one single data node.
#
sub EnterSlice {
  my ($this, $slice, %opt) = @_;
  my (%lopt, $pActionNode);
  my ($letter, $CtSize, $pNode, $SliceDone);

  # function parameters
  %lopt = $this->_LocalSwitch(%opt);
  $lopt{-purify} and $slice = &SeqStrPure ($slice);
  $lopt{-upper} and $slice =~ tr/a-z/A-Z/;
  $pActionNode = $lopt{-action} || sub {
    my ($pNodeDat) = @_;
    $$pNodeDat{ct} ++;
    };

  # start at base of data structure
  $pNode = $this->{SuffixTrie};

  # loop into target depth
  # we don't use &_GetNode here because we successively chain into the next
  #   subtree, at every node we change the data
  foreach $letter (split (//,$slice)) {
    $CtSize ++;

    # get onto current suffix trie level (ensure there's one)
    $$pNode{$letter} ||= $this->_NewNode();
    $pNode = $$pNode{$letter};
    $SliceDone .= $letter;

    # for double-stranded sequences: link tuple to its reverse-complement
    if (!exists($$pNode{data}) and !$lopt{-strands}) {
      $this->_NodeLinkRevcompl($pNode,$SliceDone);
    }
    if ($CtSize < $lopt{-TupleMin}) { next }

    # node action, default: rise tuple count
    $$pNode{data} ||= {};
    &$pActionNode ($$pNode{data},%opt);
  }

  # delete memory data which are obsolete now
  delete $this->{LevelSum};
}


# scan sequence for slices, enter subsequences into suffix trie data structure
#
# INTERFACE
# - argument 1: sequence string
#
# - options:
#   [same as &EnterSlice]
#
sub SeqScan {
  my ($this, $seq, %opt) = @_;
  my (%lopt, $debug);
  my ($CtPos, $slice);

  # function parameters
  %lopt = $this->_LocalSwitch(%opt);
  $debug = $lopt{-debug};
  $lopt{-purify} and $seq = &SeqStrPure ($seq);
  $lopt{-upper} and $seq =~ tr/a-z/A-Z/;
  $debug and printf STDERR "%s. tuple size range: %d..%d\n", &MySub,
    $lopt{-TupleMin}, $lopt{-TupleMax};
  $debug and printf STDERR "%s. strand model %d\n", &MySub, $lopt{-strands};

  # replace unknown symbols
  my $TrashSmb = $this->{alph}{TrashSmb};
  my $symbol = join ('', $this->alph_ok());  # alphabet, including trash symbol
  $seq =~ s/[^$symbol]+/$TrashSmb/g;

  # scan for slices at all sequence positions
  while ($slice = substr ($seq, $CtPos, $lopt{-TupleMax})) {
    $CtPos ++;

    # enter slice into suffix trie
    if ($lopt{-strands}>=0) {
      $this->EnterSlice ($slice, -pos=>$CtPos, %opt);
    }
    if ($lopt{-strands}==-1 or $lopt{-strands}==2) {
      $this->EnterSlice (&SeqStrRevcompl($slice), -pos=>$CtPos, %opt);
    }
  }

  # delete data structures which are obsolete now
  # done in &EnterSlice
}


# return count of a tuple
#
# INTERFACE
# - argument 1: sequence slice
# - return val: reference to suffix trie node data structure
#
sub NodeData {
  my ($this, $slice) = @_;
  my ($pNode);

  # get node data structure
  $pNode = $this->_GetNode ($slice, -LinkRev=>(!$this->{switch}{-strands}));

  # return count
  return $$pNode{data};
}


1;
# $Id: SuffixTrie.pm,v 1.18 2005/05/03 15:02:55 szafrans Exp $
