################################################################################
#
#  kPerl Dictyostelium ReadWatch Concept
#  Library for Management of a Set of Sequence Clusters
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 2000
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - this library code provides an interface between the AlnK Suite and the
#   Dictyostelium ReadWatch Concept.
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %_LibParam
#
# - data management
#   $_LibParam{*}
#   &_LoadRc
#
# - handle information
#   &ClusterAlns
#   &ClusterStat - calling hierarchy:
#     - &ReadWatch::Cluster::_LoadRc
#     - ? &ReadWatch::Cluster::ClusterStatCalc
#       - &ReadWatch::Cluster::ClusterAlns
#       - &ReadWatch::Library::TgtspecifLibgrpRegard
#       - sample counts in reference DB
#         - &ReadWatch::Library::TgtspecifLibgrpNum
#           - ? ...
#       - sample counts in alignment
#         - Align.pl -script
#
# - calculate
#   &ClusterStatCalc
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#   -rc         path of cluster info resource file (default: $_LibParam{default}{PathRc})
#
################################################################################

package ReadWatch::Cluster;

# includes
#use strict; use warnings;  # OK 2003xxxx
use MainLib::Data;
use MainLib::Path qw(%CorePath);
use MainLib::File;
use MainLib::Misc qw(&MySub);
use Math::kCalc qw(&Sum);
use ReadWatch::Library;

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &ClusterAlns &ClusterStat
  &ClusterStatCalc
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# data management
################################################################################

%_LibParam = (
  default => {
    PathRc => $CorePath{ReadWatch}{home} .'/'. $CorePath{ReadWatch}{ClusterRc},
    },
  PathRc  => undef,
  data    => undef,
  );


# load cluster info resource
#
# INTERFACE
# - options:
#   -debug      [STD]
#   -rc         [STD]
#
# - return val: success status (boolean)
#
sub _LoadRc {
  my (%opt) = @_;
  my ($debug);
  my ($PathRc);

  # function parameters
  $debug = $opt{-debug};
  $PathRc = $opt{-rc} || $_LibParam{default}{PathRc};

  # rc file data present
  if ($_LibParam{data} and
      $PathRc eq $_LibParam{PathRc}) {
    return 1;
  }

  # read target specificity rc file
  unless (%{ $_LibParam{data} = &DataRead($PathRc)||{} }) {
    $debug and printf STDERR "ReadWatch::Cluster::_LoadRc. ERROR: unable to load rc file %s\n", $PathRc||"''";
    return undef;
  }
  $_LibParam{PathRc} = $PathRc;

  # exit SUB successfully
  return 1;
}


################################################################################
# handle information
################################################################################


# alignments contributing to a cluster project
#
# INTERFACE
# - argument 1: cluster ID
#
# - options:
#   -debug      [STD]
#   -rc         [STD]
#   -SlcHoriz   select for horizontally arranged alignments
#   -SlcVert    select for vertically arranged alignments
#
# - return val: array of alignment file paths
#
sub ClusterAlns {
  my ($ClusterID, %opt) = @_;
  my ($debug);
  my (@cluster);

  # function parameters
  $debug = $opt{-debug};
  &_LoadRc(%opt) or return ();
  unless (exists $_LibParam{data}{$ClusterID}) {
    $debug and printf STDERR "%s. ERROR: there's no entry '%s' in cluster library\n", &MySub, $ClusterID;
    return ();
  }

  # array of alignment file paths
  if ($opt{-SlcHoriz} or ! (grep { m/^-Slc/ } keys %opt)) {
    push @cluster, @{$_LibParam{data}{$ClusterID}{AlnFileHoriz}};
  }
  if ($opt{-SlcVert} or ! (grep { m/^-Slc/ } keys %opt)) {
    push @cluster, @{$_LibParam{data}{$ClusterID}{AlnFileVert}};
  }
  return @cluster;
}


# return table with cluster project statistics
#
# INTERFACE
# - argument 1:   cluster ID
#
# - options:
#   -debug        [STD]
#   -rc           [STD]
#   -RcTgtSpecif  target specificty rc file (default in ReadWatch::Library)
#   -update       use value instead of $RcParam{$ClusterID}{NumTimeOutdate}
#                 to decide whether the statistics have to be updated
#
# - return val:   - data reference (table type HIH)
#                 - undef if an error occurred
#
sub ClusterStat {
  my ($ClusterID, %opt) = @_;
  my ($debug);
  my ($TimeOutdate, $TimeData);

  # function parameters
  $debug = $opt{-debug} || 1;
  &_LoadRc (%opt, -debug=>$debug) or return undef;
  unless (exists $_LibParam{data}{$ClusterID}) {
    printf STDERR "%s. WARNING: there's no library entry for cluster '%s'\n  trying to treat it as an AlnK project\n", &MySub, $ClusterID;
    return &ClusterStatCalc ($ClusterID, %opt);
  }

  # update calculation
  $TimeOutdate = $opt{-update} || $_LibParam{data}{$ClusterID}{StatTimeOutdate};
  # take the least recent time of access
  $TimeData = &ftime ($_LibParam{data}{$ClusterID}{StatFile});
  if (! (-e $_LibParam{data}{$ClusterID}{StatFile})) {
    $debug and printf STDERR "%s. don't find cluster statistics file\n", &MySub;
  } elsif ($TimeData + $TimeOutdate < time) {
    $debug and printf STDERR "%s. cluster statistics file is outdated:\n", &MySub;
    $debug and printf STDERR "  file %d\n  outdate %d\n  time %d\n",
      $TimeData, $TimeOutdate, time;
  } else {
    $debug and printf STDERR "%s. cluster statistics file exists in a recent version\n", &MySub;
    $debug and printf STDERR "  file %d\n  outdate %d\n  time %d\n",
      $TimeData, $TimeOutdate, time;
  }
  if ($TimeData + $TimeOutdate < time or
      ! -e $_LibParam{data}{$ClusterID}{StatFile}) {
    $debug and printf STDERR "%s. (re)calculating cluster statistics\n", &MySub;
    return &ClusterStatCalc ($ClusterID, %opt);
  }

  # calculate expected read number
  return &DataRead ($_LibParam{data}{$ClusterID}{StatFile});
}


################################################################################
# calculate
################################################################################


# (re)calculate cluster project statistics
#
# INTERFACE
# - argument 1:   cluster ID, may be
#                 1. ID of cluster library entry
#                 2. AlnK project specifier
#
# - options:
#   -debug        [STD]
#   -rc           [STD]
#   -RcTgtSpecif  target specificty rc file (default in ReadWatch::Library)
#   -update       use value instead of $RcParam{$ClusterID}{NumTimeOutdate}
#                 to decide whether the statistics have to be updated
#
# - return val:   success status (boolean)
#
sub ClusterStatCalc {
  my ($ArgCluster, %opt) = @_;
  my ($debug, $dbg2, @update, @RcTgtSpecif);
  my (@SpecCluster, %ClusterHoriz, @LibGroup);
  my (%call, $CurrCluster, $CurrLib, $CurrField, $SlcID, $buffer);
  my (%DataPrim, %DataFin);

  ##############################################################################
  # initial work

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  @update = exists ($opt{-update}) ?
    (-update=>$opt{-update}) : ();
  @RcTgtSpecif = exists ($opt{-RcTgtSpecif}) ?
    (-rc=>$opt{-RcTgtSpecif}) : ();
  &_LoadRc(%opt) or return undef;

  # derive cluster or set of clusters
  if (@SpecCluster = &ClusterAlns ($ArgCluster, %opt)) { 
    %ClusterHoriz = map { ($_ => 1) } &ClusterAlns ($ArgCluster, -SlcHoriz=>1, %opt);
    # got an array of project specifiers from library
    $debug and printf STDERR "%s. working on clusters:\n%s", &MySub,
      join ('', map { "$_\n" } @SpecCluster);
  } elsif (-r (split (',', $ArgCluster))[0]) {
    @SpecCluster = ($ArgCluster);
    %ClusterHoriz = map { ($_ => 1) } @SpecCluster;
    $debug and printf STDERR "%s. working on cluster %s", &MySub, $ArgCluster;
  } else {
    printf STDERR "ERROR: %s / %s is neither a project nor set of projects\n",
      $ArgCluster||"''", (split (',', $ArgCluster))[0]||"''";
    return undef;
  }
 
  # set of library groups
  @LibGroup = &TgtspecifLibgrpRegard ('cluster', @RcTgtSpecif);
  if (@LibGroup) {
    $debug and printf STDERR "%s. library groups: %s\n", &MySub,
      join (', ', @LibGroup);
  } else {
    printf STDERR "ERROR: got no list of sequence libraries\n";
    exit 1;
  }

  ##############################################################################
  # sample data from alignments contributing to cluster

  foreach $CurrCluster (@SpecCluster) {
    # sample data: loop over library groups
    foreach $CurrLib (@LibGroup) {

      # prepare AlnK call
      $debug and printf STDERR "%s. analysis of cluster %s\n", &MySub, $CurrCluster||"''";
      $bTimer and $time = &Sum((times)[0,2]);
      $call{pl} = "$CorePath{call}{AlnK} -noDB $CurrCluster";
      $SlcID = &TgtspecifLibgrpSelector ($CurrLib, 'Read', @RcTgtSpecif);
      $call{AlnK} = "param Report::Statistics::SelectID $SlcID\nreport statist\nprint ids\ndismiss\n";

      # call AlnK
      unless (&open2 (\*INPROC, \*OUTPROC, $call{pl})) {
        print STDERR "ERROR: unable to start AlnK process: $call\n";
        exit 1;
      }
      &WriteFile (\*OUTPROC, $call{AlnK});

      # parse statistics
      unless (($buffer=&ReadFile(\*INPROC)) =~ m/consensus length: (\d+)\n(ID selector.+\n)?alignment( selection)?: ([0-9,]+) entries \(([0-9,]+) nucleotides\)/m) {
        printf STDERR "ERROR: unable to parse data from AlnK process:\n%s",
          $buffer || ($call{AlnK}."\n");
        exit 1;
      }
      $DataPrim{$CurrCluster}{$CurrLib} = {
        AlnConsLen => $ClusterHoriz{$CurrCluster} ? $1 : 0,
        AlnSeq     => $4,
        AlnNt      => $5,
        };
      $buffer = $';
      foreach (keys %{$DataPrim{$CurrCluster}{$CurrLib}}) {
        $DataPrim{$CurrCluster}{$CurrLib}{$_} =~ s/,//g; 
      }

      # parse list of IDs
      # be careful not to treat this list as a numeric entry!
      @{$DataPrim{$CurrCluster}{$CurrLib}{id}} =
        grep { m/$SlcID/ } split (/\n/, $buffer); 
     
      if ($debug) {
        printf STDERR "%s. consensus length %d for proj %s / lib %s\n", &MySub,
          $DataPrim{$CurrCluster}{$CurrLib}{AlnConsLen}, $CurrCluster, $CurrLib;
        if (grep { ! $_ } @{$DataPrim{$CurrCluster}{$CurrLib}}{qw(AlnSeq AlnNt)}) { 
          printf STDERR "%s. missing data when parsing output:\n$buffer", &MySub;
        } else {
          printf STDERR "%s. parsed statistics for:\n  cluster: %s\n  library group: %s\n", &MySub,
            $CurrCluster, $CurrLib;
        }
      }
    }
  }
  
  ##############################################################################
  # final table fill-in and table calculations

  $CurrCluster = @SpecCluster[0];
  foreach $CurrLib (@LibGroup) {

    # summarize over clusters
    foreach $CurrField (keys %{$DataPrim{$CurrCluster}{$CurrLib}}) {
      ref($DataPrim{$CurrCluster}{$CurrLib}{$CurrField}) and next;
      $DataFin{$CurrLib}{$CurrField} =
        &Sum (map{@{$_||[]}} &DataTreeSlc (\%DataPrim, [['','all'],[$CurrLib],[$CurrField]], -debug=>$dbg2));
    }
    $DataFin{$CurrLib}{AlnSeqId} = { map{ ($_=>1) } map {@{$_||[]}} map {@{$_||[]}}
      &DataTreeSlc (\%DataPrim, [['','all'],[$CurrLib],['id']], -debug=>$dbg2)
      };
    $DataFin{$CurrLib}{AlnSeq} = int keys %{$DataFin{$CurrLib}{AlnSeqId}};
    delete $DataFin{$CurrLib}{AlnSeqId};

    # enter database statistics
    $DataFin{$CurrLib}{LibSeq}     = &TgtspecifLibgrpNum ($CurrLib, 'Read', @update, @RcTgtSpecif);
    $DataFin{$CurrLib}{LibNt}      = &TgtspecifLibgrpNum ($CurrLib, 'Nt', @update, @RcTgtSpecif);
    $DataFin{$CurrLib}{LibGenomic} = &TgtspecifLibgrpSpecif ($CurrLib, 'genome', 'Nt', @update, @RcTgtSpecif);
  }
  # don't use %DataPrim from now on!
  
  # summarize over libraries
  foreach $CurrField (keys %{$DataFin{$LibGroup[0]}}) {
    $DataFin{sum}{$CurrField} =
      &Sum (@{ &DataTreeSlc (\%DataFin, [['','all'],[$CurrField]], -debug=>$dbg2) });
  }
  $DataFin{sum}{AlnConsLen} = $DataFin{$LibGroup[0]}{AlnConsLen};
  delete $DataFin{sum}{LibGenomic};
  # extrapolate for genome
  foreach $CurrLib (@LibGroup) {
    $DataFin{$CurrLib}{AlnSeqRelGnm} = $DataFin{$CurrLib}{AlnSeq} / $DataFin{$CurrLib}{LibSeq} / $DataFin{$CurrLib}{LibGenomic};
    $DataFin{$CurrLib}{AlnNtRelGnm} = $DataFin{$CurrLib}{AlnNt}  / $DataFin{$CurrLib}{LibNt}  / $DataFin{$CurrLib}{LibGenomic};
    $DataFin{$CurrLib}{GenomeSize} = &TgtspecifTgtNum ('genome', 'Nt', @RcTgtSpecif);
    $DataFin{$CurrLib}{GenomeNt} = $DataFin{$CurrLib}{AlnNt} * &TgtspecifTgtNum ('genome', 'Nt', @update, @RcTgtSpecif) / $DataFin{$CurrLib}{LibNt} / $DataFin{$CurrLib}{LibGenomic};
    $DataFin{$CurrLib}{GenomeCopyNt} = $DataFin{$CurrLib}{GenomeNt} / $DataFin{$CurrLib}{AlnConsLen};
    $DataFin{$CurrLib}{GenomeCopySeq} = $DataFin{$CurrLib}{AlnSeq} /
      ($DataFin{$CurrLib}{AlnConsLen} / $DataFin{$CurrLib}{GenomeSize} * $DataFin{$CurrLib}{LibSeq} / $DataFin{$CurrLib}{LibGenomic});
  }

  # debug
  push @LibGroup, 'sum';
  if ($debug) {
    &DataPrint (\%DataPrim, -handle=>\*STDERR);
    printf STDERR "%s\n", '-' x 80;
    &DataPrint (\%DataFin, -handle=>\*STDERR);
    printf STDERR "%s\n", '-' x 80;
  }

  # save table, return data
  if (%DataFin and $ArgCluster ne $SpecCluster[0]) {
    require FileHandle;
    my ($hOutRc);
    my $FileRc = $_LibParam{data}{$ArgCluster}{StatFile};
    if ($hOutRc = FileHandle->new($FileRc,'w')) {
      $debug and printf STDERR "%s. saving rc data back to file %s\n", &MySub, $FileRc||"''";
    } else {
      printf STDERR "%s. ERROR: unable to write file %s\n", &MySub, $FileRc;
    }
    &DataPrint (\%DataFin, -handle=>$hOutRc, -NoAddr=>1);
  }
  return \%DataFin;
}


1;
# $Id: Cluster.pm,v 1.13 2018/06/05 18:02:56 szafrans Exp $
