#! /usr/local/bin/perl
################################################################################
#
#  Phylogenetic Analysis
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 2000-2002,
#    szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - See function &usage for description of command line syntax
#
# - each function comes along with a description at the beginning of the code
#   block
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   %GlobStore
#   $ProgFile,$ProgFstump
#   %ProgParam
#   $ProgMode,%ProgOpt,@ProgArg
#
# - usage help, command line arguments, basic I/O
#   &usage
#   &AddSwitch
#   &PrepOstump
#
# - analysis and presentation
#   &ProgTreeImg
#
# - simulations
#   &ProgAlnSimulPhylBatch
#   &ProgAlnSimulPhyl
#   &ProgAlnSimulRand
#   &ProgShotgunPolymorph
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################


# global constants and variables
# also used by modules as a unique global data anchor
our %GlobStore;

# include path(s), includes
BEGIN {
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
}
#use strict; use warnings;  # OK 20040813
use MainLib::StrRegexp qw(&TimeStr);
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs &QueryConfirm);
use MainLib::File;
use MainLib::Misc;
use Math::Calc;
use Math::Round qw(&nearest);
use Math::Random;
use Math::Statist qw(&SampleMetrics);
use database::Table qw(&TableConvert);
use SeqLab::Align;
use SeqPhylog::TreeFormat;


# script ID
# - program name as specified on the command line
our $ProgFile = ( split('/',__FILE__) )[-1];
our $ProgFstump=$ProgFile; $ProgFstump=~s/\.\w{1,4}$//;

# global constants (esp. default values)
our %ProgParam;
$ProgParam{shotgun}{overlap} = 45;
$ProgParam{default}{OutImgWidth} = 640;
$ProgParam{default}{ProgMode} = 'NONE';


# manage I/O #####################################

# organise I/O handles
&Unbuffer();


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
our $ProgMode = undef;
our @ProgArg = ();
our %ProgOpt = ();
@ProgArg = &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};

# eventually open LOG file
if ($ProgOpt{-log}) {
  $ProgOpt{LogFile} = ($ProgOpt{-log}!=1) ? $ProgOpt{-log} : undef;
  &LogOpen (-file=>$ProgOpt{LogFile}, -stamp=>$ProgFstump, -prog=>"$ProgFile -$ProgMode");
}
END {
  $ProgOpt{-log} and &LogClose();
}


# work flow manifold #############################

# chain to program mode without input argument(s)
if (0) { }
elsif (!@ARGV or $ProgMode=~m/^h(elp)?$/i) { &usage() }

# ensure input argument(s)
unless (@ProgArg) {
  print STDERR "ERROR: input arguments missing\n";
  exit 1;
}

# chain to program mode (with input argument(s))
if (0) { }
elsif ($ProgMode =~ m/^AlnSimulPhyl$/i) {
  &ProgAlnSimulPhyl ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^AlnSimulPhylBatch$/i) {
  &ProgAlnSimulPhylBatch ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^AlnSimulRand$/i) {
  &ProgAlnSimulRand ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^ShotgunPolymorph$/i) {
  my @RedoBatch = $ProgOpt{-RedoSize} ? (1 .. $ProgOpt{-RedoSize}) : (undef);
  foreach my $CtRedo (@RedoBatch) {
    &ProgShotgunPolymorph ($ProgArg[0], $CtRedo);
  }
}
elsif ($ProgMode =~ m/^TreeImg$/i) {
  &ProgTreeImg ($ProgArg[0]);
}
else {
  print STDERR "ERROR: unknown program mode or switch '$ProgMode'\n";
  exit 1;
}

# exit script successfully
# cf. END blocks!
exit 0;


################################################################################
# usage help, command line arguments, basic I/O
################################################################################


sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile is desired for phylogenetic simulations and analyses.

COMMAND LINE SYNTAX
 $ProgFile  -<ModeSwitch> [-<OptionalSwitch> ...] <Arg1> [<Arg2> ...]

Arguments
---------
 See function descriptions.

path arguments:
 Relative paths will be resolved according to the pwd. Prefixes "~" and "~uid"
 are resolved to the home directories. Path "-" resolves to STDIN or STDOUT,
 depending on the context.

ModeSwitch (case-insensitive)
-----------------------------
<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-h(elp)           output command line syntax description and exit
-AlnSimulPhyl     simulate sequence alignment corresponding to a
                  specified phylogenetic tree
                  Arg1        tree file (Newick format)
                              distances are thought to be scaled in time
-AlnSimulPhylBatch
                  do batch of -AlnSimulPhyl
                  Arg1        tree file template (Newick format)
                  -SampleSize width of simulated alignment
                  -StepSize   value of mutator clock tick
-AlnSimulRand     simulate sequence alignment randomly
                  Arg1        parameter rc file, example:
                              $CorePath{call}{MeInstDoc}/PhylonAlnSimulRand.rc
-ShotgunPolymorph Monte Carlo simulation of accumulation of polymorphism
                  data in shotgun sequencing
                  Arg1        parameter rc file, example:
                              $CorePath{call}{MeInstDoc}/PhylonShotgunPolymorph.rc
                  -RedoSize   perform array of experiments, use countig suffix
                              for output filenames
-TreeImg          prepare image for tree file
                  Arg1        tree file (Newick format)
                              distances are thought to be scaled in time

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-debug(=N)        print debug protocol to STDOUT/STDERR. A debug depth value
                  may be specified via switch parameter N in syntax '-debug=N'.
-log(=S)          redirect STDOUT and STDERR to LOG file
                  S  specify LOG output path. By default, filename and
                     directory are worked out automatically from built-in
                     directives.
-OutDir=S         directory for multi-file output. This switch overrides any
                  directory statement provided with switch -OutStump.
-OutImg(=S)       force program to produce image output and (optionally)
                  specify output path
-OutImgRelhigh=F  define the output image's height in relation to its width
-OutImgTransp     turn image's background transparent
-OutImgWidth=N    define the output image's pixel width
-OutStump=S       path stump for multi-file output. A default is derived from
                  input file names in most cases.
-RedoSize=N       redo batch size for simulations
-SampleSize=N     specify sample size for resampling etc.
-StepSize=N|F     specify step size for batch calculation etc.
-v(erbose)        print extended action protocol to STDOUT
-WinSize=N|F      specify window size for smoothening etc.

Environment Variables
---------------------
 \$PERLPATH        primary search path for Perl package look-up
END_USAGE
  print "\n";
  exit 0;
}


# add program switches to global table (hash)
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global data:
#   $ProgMode
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &MainLib::Misc::GetoptsNArgs
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can reported in accordance to
#   the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my $switch = shift;

  # optional switches
  if ($switch =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($switch =~ m/^log(=(.*))?$/i) {
    $ProgOpt{-log} = $2 ? &PathExpand($2) : 1;
    return;
  }
  if ($switch =~ m/^OutDir=(.+)$/i) {
    $ProgOpt{-OutDir} = &PathExpand ($1);
    unless (-d $ProgOpt{-OutDir}) {
      if (-e $ProgOpt{-OutDir}) {
        printf STDERR "ERROR: output destination exists, but is not a directory: %s\n", $ProgOpt{-OutDir}||"''";
        exit 1;
      }
      
      # this dialogue will also appear if there's no output do be done to any
      # file/directory
      else {
        if (int(grep{ $_ eq '-' }@ARGV)) {
          printf STDERR "creating non-existent output directory (skip dialogue in STDIN input mode)\n";
          mkdir ($ProgOpt{-OutDir});
        } else {
          printf STDERR "output directory does not exist, create?";
          if (&QueryConfirm()) {
            mkdir ($ProgOpt{-OutDir});
          } else { exit 1 }
        }
      }
    }
    return;
  }
  if ($switch =~ m/^OutImg(=(.+))?$/i) {
    $ProgOpt{-OutImg} = $2 ? &PathExpand($2) : '';
    return;
  }
  if ($switch =~ m/^OutImgRelhigh=([0-9\.eE+-]+)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgRelhigh} = $1;
    return;
  }
  if ($switch =~ m/^OutImgTransp(ar)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgTransp} = 1;
    return;
  }
  if ($switch =~ m/^OutImgWidth=(\d+)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgWidth} = $1;
    return;
  }
  if ($switch =~ m/^OutSt[au]mp=(.+)$/i) {
    $ProgOpt{-OutStump} = $1;
    return;
  }
  if ($switch =~ m/^RedoSize(=(\d+))$/i) {
    $ProgOpt{-RedoSize} = $2;
    return;
  }
  if ($switch =~ m/^SampleSize(=(\d+))$/i) {
    $ProgOpt{-SampleSize} = $2;
    return;
  }
  if ($switch =~ m/^StepSize=([\d.eE+-]+)$/i) {
    $ProgOpt{-StepSize} = $1;
    return;
  }
  if ($switch =~ m/^WinSize=([\d.eE+-]+)$/i) {
    $ProgOpt{-WinSize} = $1;
    return;
  }
  if ($switch =~ m/^v(erbose)?$/i) {
    $ProgOpt{-verbose} = 1;
    return;
  }

  # program mode switches
  if (defined $ProgMode) {
    printf STDERR "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($switch||"''");
    exit 1;
  }
  else {
    $ProgMode = $switch;
    return;
  }
}


# work out output path base from source file and/or switches
#
# INTERFACE
# - global options:
#   -debug       [STD]
#   -OutDir      [STD]
#   -OutStump    [STD]
# - return val:  output path base
#
sub PrepOstump {
  my ($debug);
  my ($PathStamp, $PathRef);

  # function parameters
  $debug = $ProgOpt{-debug};

  # prepare output path base
  # 1st: $ProgOpt{-OutStump}
  # 2nd: $ProgArg[0]
  unless ($PathStamp = $ProgOpt{-OutStump}) {
    $PathRef = $ProgArg[0];
    if ($PathRef eq '-') { $PathRef = 'stdin'; }
    $PathStamp = &PathChgSuffix (&PathExpand($PathRef), '', -last=>1);
  }

  # determine effective directory
  $ProgOpt{-OutDir} and $PathStamp =
    $ProgOpt{-OutDir} .'/'. &PathSplit($PathStamp)->{name};

  # final refinement: expand to nice, rooted path
  $PathStamp = &PathExpand ($PathStamp);

  # return path
  return $PathStamp;
}


################################################################################
# analysis and presentation
################################################################################


# simulation of sequence alignment corresponding to a specified phylogenetic tree
#
# INTERFACE
# - argument 1: template tree file
#
# - global options:
#   -debug      [STD]
#
# DESCRIPTION
# - negative or zero branch lengths in the template tree will cause a
#   failure of the function
#
sub ProgTreeImg {
  my ($PathSrc) = @_;
  my ($debug, $dbg2);
  my (%path, $pTree);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # load tree
  $debug and printf STDERR "%s. get tree data from file %s\n", &MySub, $PathSrc;
  unless ($pTree = &TreeFromNewick ($PathSrc, -debug=>$dbg2)) {
    printf STDERR "ERROR when loading tree file %s\n", $PathSrc||"''";
    exit 1;
  }

  # draw tree
  $path{img} = $ProgOpt{-OutImg} || &PrepOstump() .'_tree.png';
  &WriteFile ("$path{img}", &TreeRootedImg ($pTree,
    -ImgWidth   => $ProgOpt{-OutImgWidth},
    -ImgRelhigh => $ProgOpt{-OutImgRelhigh},
    -debug      => $dbg2));
}


################################################################################
# simulations
################################################################################


# batch script mode -AlnSimulPhyl
#
# INTERFACE
# - argument 1: tree template file
#
# - global options:
#   -debug      [STD]
#
sub ProgAlnSimulPhylBatch {
  my ($PathSrc) = @_;
  my ($debug, $dbg2, $SwitchSampleSize, $SwitchStepSize);
  my (%path, %ItCalcLib, @ItCalcSpec, $pItCalc, $DistVar);
  my ($TreeTempl, $TreeCurr, $NumReplica, $call);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $path{stamp} = &PrepOstump();
  $path{stampname} = &PathSplit ($path{stamp})->{name};
  $SwitchSampleSize = '-SampleSize=10000';
  $SwitchStepSize = '-StepSize=0.0025';
  @ItCalcSpec = qw(12_JC08 12_JC08b);
#  @ItCalcSpec = qw(2a 2b 2c);

  # iterator parameters for distance sets
  # long range, homogeneous distance set
  $ItCalcLib{'2a'} = { offset=>0.00, step=>0.02, limit=>0.4001, format=>'%.2f', replica=>3,
    func => sub { my $pSet = $_[0];
      $$pSet{DistA} = $$pSet{it};
      $$pSet{DistB} = $$pSet{DistC} = 0;
    } };
  $ItCalcLib{'2b'} = { offset=>0.40, step=>0.05, limit=>1.0001, format=>'%.2f', replica=>3,
    func => $ItCalcLib{'2a'}{func} };
  $ItCalcLib{'2c'} = { offset=>1.00, step=>0.10, limit=>2.5001, format=>'%.2f', replica=>3,
    func => $ItCalcLib{'2a'}{func} };
  # branched set - inside range, long step distance set
  $ItCalcLib{'12_JC08'} = { offset=>0, step=>0.04, limit=>0.3601, format=>'%.2f',
    reploff=>0, replica=>2,
    func => sub { my $pSet = $_[0];
      $$pSet{DistA} = 0.4;
      $$pSet{DistB} = $$pSet{it};
      $$pSet{DistC} = $$pSet{DistA} - $$pSet{DistB};
    } };
  $ItCalcLib{'12_JC08b'} = { offset=>0, step=>0.04, limit=>0.3601, format=>'%.2f',
    reploff=>2, replica=>98,
    func => sub { my $pSet = $_[0];
      $$pSet{DistA} = 0.4;
      $$pSet{DistB} = $$pSet{it};
      $$pSet{DistC} = $$pSet{DistA} - $$pSet{DistB};
    } };
  $ItCalcLib{'12_JC13'} = { offset=>0, step=>0.065, limit=>0.6001, format=>'%.3f',
    reploff=>0, replica=>100,
    func => sub { my $pSet = $_[0];
      $$pSet{DistA} = 0.65;
      $$pSet{DistB} = $$pSet{it};
      $$pSet{DistC} = $$pSet{DistA} - $$pSet{DistB};
    } };
  $ItCalcLib{'12_JC20'} = { offset=>0, step=>0.1, limit=>0.9001, format=>'%.2f',
    reploff=>0, replica=>100,
    func => sub { my $pSet = $_[0];
      $$pSet{DistA} = 1;
      $$pSet{DistB} = $$pSet{it};
      $$pSet{DistC} = $$pSet{DistA} - $$pSet{DistB};
    } };

  # load tree template
  $debug and printf STDERR "%s. loading tree template %s\n", &MySub, $PathSrc;
  $TreeTempl = &ReadFile ($PathSrc);

  # LOG parameters
  $path{LogParam} = sprintf ('%s_param.dat', $path{stamp});
  my $hOut = FileHandle->new($path{LogParam},'w');
  &DataPrint (
    { IterateSpec => join (', ', @ItCalcSpec),
      iterate => [ map { $ItCalcLib{$_} } @ItCalcSpec ],
      SwitchSampleSize => $SwitchSampleSize,
      SwitchStepSize => $SwitchStepSize,
      TreeTemplat => $PathSrc,
    }, -handle=>$hOut);

  # loop over iteration sets, iterate according to set
  foreach $pItCalc (map { $ItCalcLib{$_} } @ItCalcSpec) { 
    for (
      $$pItCalc{it}=$$pItCalc{offset}+$$pItCalc{step}; 
      $$pItCalc{it}<=$$pItCalc{limit}; 
      $$pItCalc{it}+=$$pItCalc{step}
    ) {

      # define set of distances in current iteration step
      &{$$pItCalc{func}} ($pItCalc);

      # define current tree
      $TreeCurr = $TreeTempl;
      foreach $DistVar (qw(DistA DistB DistC)) {
        $TreeCurr =~ s/$DistVar/$$pItCalc{$DistVar}/eg;
      }

      # replica loop
      foreach $NumReplica ((1+$$pItCalc{reploff}) .. (&Max ($$pItCalc{replica}, 1)+$$pItCalc{reploff})) {

        # string representation of current iterator
        $$pItCalc{itstr} = sprintf ($$pItCalc{format}, $$pItCalc{it});
        if ($$pItCalc{reploff} or $$pItCalc{replica} > 1) {
          $$pItCalc{itstr} .= "r$NumReplica";
        }

        # imagine file paths, write tree file
        $path{tree} = sprintf ('%s_%s.tre', $path{stamp}, $$pItCalc{itstr});
        &WriteFile ($path{tree}, $TreeCurr);
        $path{seqroot} = sprintf ('%s_%s_seq', $path{stamp}, $$pItCalc{itstr});
        $path{seqrootname} = &PathSplit ($path{seqroot})->{name};
        $path{aln} = sprintf ('%s_%s_seq.aln', $path{stamp}, $$pItCalc{itstr});

        # simulate sequences, analyse pairwise dissimilarity
        unlink "$path{seqroot}.log";
        system (join (' ', $CorePath{call}{AlnSimulPhyl},
          $SwitchSampleSize, $SwitchStepSize, "-log=$path{seqroot}.log", $path{tree}));
        system ("Convert.pl '-search=column labels:\\n# ' '-chg=\$\&label\\t' $path{seqroot}_DistAdv.tab > $path{seqroot}_DistAdv.tab.new");
        system ("Convert.pl '-search=\\n([^#])' '-chg=\\n$$pItCalc{itstr}\\t\$1' $path{seqroot}_DistAdv.tab.new > $path{seqroot}_DistAdv.tab");
        unlink "$path{seqroot}_DistAdv.tab.new";

        # display result
        system "cat $path{seqroot}.log";
        system "cat $path{seqroot}_DistAdv.tab";
      }
    }
  }

  # summarise batch results
  system "cat `ls $path{stampname}_*_seq_DistAdv.tab | head -1` | grep '^#' > $path{stamp}_dist.tab";
  system "cat $path{stampname}_*_seq_DistAdv.tab | grep '^[^#]' | sort >> $path{stamp}_dist.tab";

#  # process summary, better do this manually
#  # - change scale of seq pair / distance parameter into effective distance parameter
#  # - eventually group seq pairs
#  # - compute desired data plot graph
#  system (join (' ', "cp", "$path{stamp}_dist.tab", "$path{stamp}_distdat.tab"));
#  $call = join (' ', "$CorePath{call}{PerlScript}/Plot.pl $path{stamp}_distdat.tab");
#  $debug and printf STDERR "%s. calling:\n  %s\n", &MySub, $call;
#  system ($call);
}


# simulation of sequence alignment corresponding to a specified phylogenetic tree
#
# INTERFACE
# - argument 1: template tree file
#
# - global options:
#   -debug      [STD]
#
# DESCRIPTION
# - negative or zero branch lengths in the template tree will cause a
#   failure of the function
#
sub ProgAlnSimulPhyl {
  require SeqLab::SeqFormat;
  my ($PathSrc) = @_;
  my ($SeqAlphabet, $ClockTick, $debug, $dbg2, %path, $iSeqLen);
  my ($pTree, $pNode, $sSeq, $pSeqAln);
  my ($pColumn, $pTabDist3d, $pTabDist2d, $pTabDistCell, $ItId1, $ItId2);

  # function constants
  $SeqAlphabet = 'ACGT';
  $ClockTick = $ProgOpt{-StepSize} || 0.00333;

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $path{stamp} = &PrepOstump();
  $iSeqLen = $ProgOpt{-SampleSize} || 10000;

  # load tree
  $debug and printf STDERR "%s. get tree data from file %s\n", &MySub, $PathSrc;
  unless ($pTree = &TreeFromNewick ($PathSrc, -debug=>$dbg2)) {
    printf STDERR "ERROR loading tree file %s\n", $PathSrc||"''";
    exit 1;
  }
  if ($debug) {
    # WARNING for debug: backlinks in $pTree!
    &WriteFile ("${PathSrc}_debug.png", &TreeRootedImg ($pTree, -debug=>$dbg2));
#    (&TreeLeafCross ($pTree, 'TRE3-A', 'TRE3-D', -debug=>1))[1];
  }
  # check for valid branch lengths
  if (grep { $_->{length}<=0 } @{$$pTree{branch}}) {
    printf STDERR "ERROR: negative or zero branch length in tree %s\n", $PathSrc||"''";
    exit 1;
  }

  ##############################################################################
  # randomly generate sequence data

  # generate random sequence as a primer
  $sSeq = &RandStr ($iSeqLen, -chars=>$SeqAlphabet, -debug=>$dbg2);

  # do recursion on tree nodes to
  # - mutate template sequence
  # - branch into subtrees
  my $pActNode;
  $pActNode = sub {
    my ($pNode, $sSeq) = @_;
    $$pNode{seq} = $sSeq;
    foreach my $pBranch (@{$$pNode{branch}}) {
      $sSeq = $$pNode{seq};
      # mutate sequence
      $$pBranch{ClockTick} = $$pBranch{length}
        / (int ($$pBranch{length} / $ClockTick) + 1.00001);
      $$pBranch{ClockRing} = $$pBranch{ClockTick} * length($SeqAlphabet) / (length($SeqAlphabet)-1);
      for (my $CtClock=$$pBranch{ClockTick}; $CtClock<=$$pBranch{length}; $CtClock+=$$pBranch{ClockTick}) {
        $$pBranch{ClockTicked} ++;
        for (my $CtPos=0; $CtPos<$iSeqLen; $CtPos++) {
          if (rand() < $$pBranch{ClockRing}) {
            $$pBranch{ClockRang} ++;
            substr ($sSeq, $CtPos, 1) = &RandStr (1, -chars=>$SeqAlphabet);
          }
        }
      }
      if ($debug||1) {
        printf "ProgAlnSimulPhyl::ActNodeR. \n";
        printf "  branch length: %s\n", $$pBranch{length};
        printf "  clock tick: parameter %s / effective %s\n", $ClockTick, $$pBranch{ClockTick};
        printf "  clock ticked/rang, per site: %s/%s\n",
          $$pBranch{ClockTicked}, $$pBranch{ClockRang}/$iSeqLen;
      }
      # continue on next node recursively
      &$pActNode ($$pBranch{node}, $sSeq);
    }
  };
  &$pActNode ($$pTree{RootNode}, $sSeq);

  ##############################################################################
  # analyse sequence data
  my $hOutTab;

  # sample alignment, output
  $pSeqAln = &AlnClone ($$pTree{LeafNode}, -KeyStd=>1, -KeySeq=>'seq');
  $path{align} = $path{stamp} . '_seq.aln';
  print  "writing $path{align}\n";
  &WriteFile ($path{align}, &AlnClustalSprint ($pSeqAln));

  # calculate/output actual pair-wise distances
  # - no gaps => no gap masking
  ($pTabDist2d, $pColumn, $pTabDist3d) = &AlnTabDist ($pSeqAln, -debug=>$dbg2);
  ($pTabDist2d, $pColumn) = &TableConvert ('HIH', 'AH', $pTabDist2d,
    -LineLabel=>$pColumn, -ColLabel=>['id',@$pColumn], -debug=>$dbg2);
  $path{DistStd} = $path{stamp} . '_seq_Dist.tab';
  if ($hOutTab = FileHandle->new($path{DistStd},'w')) {
    printf "writing file %s\n", $path{DistStd};
  } else {
    printf STDERR "ERROR: unable to write to file %s\n", $path{DistStd};
    exit 1;
  }
  print  $hOutTab "# pairwise distance table\n";
  printf $hOutTab "# date/time: %s\n", &TimeStr();
  printf $hOutTab "# consensus length: %d\n", length $$pSeqAln[0]{sequence};
  printf $hOutTab "#\n# column labels:\n# %s\n", join ("\t", @$pColumn);
  foreach (@$pTabDist2d) {
    printf $hOutTab "%s\n", join ("\t", @{$_}{@$pColumn});
  }
  undef $pColumn;

  # calculate:
  # - expected pair-wise distances
  # - variances around expected pair-wise distances
  undef $pTabDist2d;
  foreach $ItId1 (sort grep{ !/$reAlnConsens/io } keys %$pTabDist3d) {
    foreach $ItId2 (sort grep{ !/$reAlnConsens/io and $_ ne $SeqLab::SeqFormat::LibGlob{KeyId} } keys %{$$pTabDist3d{$ItId1}}) {
      $$pTabDist3d{$ItId1}{$ItId2} or next;
      $pTabDistCell = $$pTabDist2d{"$ItId1-$ItId2"} = $$pTabDist3d{$ItId1}{$ItId2};
      $$pTabDistCell{DistObs} = $$pTabDistCell{dist}; delete $$pTabDistCell{dist};
      $$pTabDistCell{JukesCantor} = (&TreeLeafCross ($pTree, $ItId1, $ItId2, -debug=>1))[1];
      $$pTabDistCell{DistExpect} = 3/4 * (1 - $const{euler} ** (-4/3 * $$pTabDistCell{JukesCantor}));
      $$pTabDistCell{DistExpectExpvar} = $$pTabDistCell{DistExpect} * (1 - $$pTabDistCell{DistExpect});
      $$pTabDistCell{DistExpectEmpvar} = ($$pTabDistCell{Nshared} > 1) ? (
        $$pTabDistCell{Ndiff} * (1 - $$pTabDistCell{DistExpect})**2 +
        ($$pTabDistCell{Nshared} - $$pTabDistCell{Ndiff}) * $$pTabDistCell{DistExpect}**2
        ) / ($$pTabDistCell{Nshared} - 1) : 0;
    }
  }
  if ($debug) {
    $path{DistDebug} = $path{stamp} . '_seq_DistDebug.tab';
    print  "writing file $path{DistDebug}\n";
    my $hOutDbg = FileHandle->new($path{DistDebug},'w');
    &DataPrint ($pTabDist2d, -handle=>$hOutDbg);
  }
  ($pTabDist2d, $pColumn) = &TableConvert ('HIH', 'AH', $pTabDist2d, -ColLabel=>['SeqPair'], -debug=>$dbg2);
  $path{DistAdv} = $path{stamp} . '_seq_DistAdv.tab';
  if ($hOutTab = FileHandle->new($path{DistAdv},'w')) {
    printf "writing file %s\n", $path{DistAdv};
  } else {
    printf STDERR "ERROR: unable to write to file %s\n", $path{DistAdv};
    exit 1;
  }
  print  $hOutTab "# advanced pairwise distance table\n";
  printf $hOutTab "# date/time: %s\n", &TimeStr();
  printf $hOutTab "# consensus length: %d\n", length $$pSeqAln[0]{sequence};
  printf $hOutTab "#\n# column labels:\n# %s\n", join ("\t", @$pColumn);
  foreach (@$pTabDist2d) {
    printf $hOutTab "%s\n", join ("\t", @{$_}{@$pColumn});
  }
}


# simulate sequence alignemnt randomly
#
# INTERFACE
# - argument 1:  parameter rc file
# - argument 2*: redo counter
#
# - global options:
#   -debug       [STD]
#
# DESCRIPTION
# - procedure:
#   - construct random alignment containing polymorphisms (low sequence
#     distance)
#   - output alignment and alignment properties
#
# DEBUG, CHANGES, ADDITIONS
# - Wenn zwei Polymorphismen sich an einer Sequenzposition berlagern -
#   Partition je 0.3 -, dann hat der eine Polymorphismus eine Frequenz von 0.3,
#   der andere 0.21. Ist das korrekt?
# - check the default output path concept
#
sub ProgAlnSimulRand {
  my ($ArgRc, $RedoNum) = @_;
  my ($debug, $dbg2, $OutStump);
  my ($pRc, $pRcPolymorph, $CtPolymorph, @HavePolymorph);
  my ($pAlnAc, $CtCol, $pAln, $pAlnSmb, $pSmb);
  my ($pTabConsDev, @column, %ColLabel, @resolve, $SeqSlice);
  my ($CtSeq, $AlnSeq, $CtPos);
  my (%file, $hOutTab, $hOutDat);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $OutStump = $ProgOpt{-OutStump} || './AlnSimulRand';
  $debug and printf STDERR "%s. entered SUB\n", &MySub;

  ##############################################################################
  # pre-work

  # directives from rc file
  unless ($pRc = &DataRead ($ArgRc)) {
    printf STDERR "ERROR: no rc data in file %s\n", $ArgRc||"''";
    exit 1;
  }
  foreach (qw(CopyNum CopySize Polymorph)) {
    if (! exists $$pRc{$_}) {
      printf STDERR "ERROR: directive %s is undefined in rc file, exiting\n", $_||"''";
      exit 1;
    }
  }
  foreach $pRcPolymorph (@{$$pRc{Polymorph}}) {
    unless ((exists $$pRcPolymorph{Num} or exists $$pRcPolymorph{NumRelPercons})
         and exists $$pRcPolymorph{Partit}) {
      printf STDERR "ERROR: directive in polymorphism entry is undefined, exiting\n";
      exit 1;
    }
    $$pRcPolymorph{Num} ||= &nearest (1, $$pRcPolymorph{NumRelPercons}*$$pRc{CopySize});
    $$pRcPolymorph{PartitNum} = &nearest (1, $$pRcPolymorph{Partit}*$$pRc{CopyNum});
  }

  ##############################################################################
  # generate alignment

  # generate data pool
  # - table data structure type AC
  # - use numbers as sequence symbols
  $pAlnAc = [];
  for ($CtCol=0; $CtCol<$$pRc{CopySize}; $CtCol++) {
    $$pAlnAc[$CtCol] = [ (0) x $$pRc{CopyNum} ];
  }
  foreach $pRcPolymorph (@{$$pRc{Polymorph}}) {
    for ($CtPolymorph=0; $CtPolymorph<$$pRcPolymorph{Num}; $CtPolymorph++) {
      $CtCol = int rand $$pRc{CopySize};
      for (0 .. ($$pRcPolymorph{PartitNum}-1)) {
        $$pAlnAc[$CtCol][$_] = $HavePolymorph[$CtCol] + 1;
      }
      $$pAlnAc[$CtCol] = &RandArrayOrder ($$pAlnAc[$CtCol]);
      $HavePolymorph[$CtCol] ++;
    }
  }

  # nt symbol version of the simulation
  $pAlnSmb = &DataClone ($pAlnAc);
  foreach $CtCol (@$pAlnSmb) {
    $pSmb = &RandArrayOrder ([qw(A C G T)]);
    @$CtCol = map { $$pSmb[$_] } @$CtCol;
  }
  # shuffle columns, convert to alignment
  $CtSeq = 0;
  $pAln = &TableConvert ('AC', 'AA', $pAlnAc);
  $pAlnSmb = [ map { { id => sprintf ('Rand%s', ++$CtSeq), sequence => join ('', @$_) }; }
    @{ &TableConvert ('AC', 'AA', $pAlnSmb) }
    ];

  # output alignment
  &WriteFile ("${OutStump}${RedoNum}.aln", &AlnClustalSprint ($pAlnSmb));

  ##############################################################################
  # alignment properties

  # plots:
  # - consensus deviations
  # - symbol diversity
  $pTabConsDev = (&AlnTabConsDev ($pAlnSmb, -debug=>$dbg2))[0];
  $file{ConsDev} = "${OutStump}${RedoNum}_ConsDev.tab";
  if ($hOutTab = FileHandle->new($file{ConsDev},'w')) {
    printf "writing file %s\n", $file{ConsDev};
  } else {
    printf STDERR "ERROR: unable to write file %s\n", $file{ConsDev};
  }
  @column            = qw(pos class emit     freq      id );
  @ColLabel{@column} = qw(pos class emission frequency IDs);
  print  $hOutTab "# table of consensus deviations (polymorphisms)\n";
  printf $hOutTab "# date/time: %s\n", &TimeStr();
  printf $hOutTab "# consensus length: %d\n", $$pRc{CopySize};
  printf $hOutTab "# symbol diversity (pi): %.6f\n", &AlnDiversity ($pAlnSmb, -ConsLen=>$$pRc{CopySize}, -ConsDev=>$pTabConsDev);
  printf $hOutTab "#\n# column labels:\n# %s\n", join ("\t", @ColLabel{@column});
  foreach $CtPos (sort { $$a{pos}   <=> $$b{pos} or
                         $$a{class} cmp $$b{class} or
                         $$a{emit}  cmp $$b{emit} } @$pTabConsDev) {
    printf $hOutTab "%s\n", join ("\t", @{$CtPos}{@column});
  }

  # assembly resistance
  foreach $CtSeq (@$pAlnSmb) {
    $CtPos = 1;
    ProgAlnSimulRandSubstr: while ($CtPos <= $$pRc{CopySize}) {
      $SeqSlice = substr ($$CtSeq{sequence}, 0, $CtPos);
      if ((grep { $SeqSlice eq $_ }
           map  { substr ($_->{sequence}, 0, $CtPos) } @$pAlnSmb) > 1) {
        $CtPos ++; next ProgAlnSimulRandSubstr;
      } else {
        last ProgAlnSimulRandSubstr;
      }
    }
    push @resolve, $CtPos;
  }
  $file{ResolveData} = "${OutStump}${RedoNum}_ResolveData.dat";
  &WriteFile ($file{ResolveData}, join ('', map { "$_\n" } @resolve));
  $file{ResolveStat} = "${OutStump}${RedoNum}_ResolveStat.dat";
  $hOutDat = FileHandle->new($file{ResolveStat},'w');
  &DataPrint (&SampleMetrics(\@resolve,-debug=>$dbg2), -handle=>$hOutDat);
}


# Monte Carlo simulation of acquisition of polymorphism data from shotgun reads
#
# INTERFACE
# - argument 1:  parameter rc file
# - argument 2*: redo counter
#
# - global options:
#   -debug       [STD]
#
# DESCRIPTION
# - procedure:
#   - construct random alignment containing polymorphisms (low sequence
#     distance)
#   - output alignment properties
#   - do shotgun drawings, record alignment coverage and proportion of
#     localized polymorphisms
#
# DEBUG, CHANGES, ADDITIONS
# - implement subcalling of &ProgAlnSimul
# - check the default output path concept
#
sub ProgShotgunPolymorph {
  my ($ArgRc, $RedoNum) = @_;
  my ($debug, $dbg2, $OutStump);
  my ($pRc, $pRcPolymorph, $pAlnAc, $CtCol, $CtPolymorph, @HavePolymorph, $pAln);
  my ($pAlnSmb, $pSmb, $pTabConsDev, @column, %ColLabel, @resolve, $SeqSlice);
  my ($CtSeq, $AlnSeq, $iSeqLen, $SeqOff, $CtNt, $CtPos, @emitted);
  my (@plot, $CtEmit);
  my (%file, $hOutTab, $hOutDat);

  # function parameters
  $debug = $ProgOpt{-debug} = 1;
  $dbg2  = $debug ? $debug-1 : undef;
  $OutStump = $ProgOpt{-OutStump} || './ShotgunPolymorph';
  $debug and printf STDERR "%s. entered SUB\n", &MySub;

  ##############################################################################
  # pre-work

  # directives from rc file
  unless ($pRc = &DataRead($ArgRc)) {
    printf STDERR "ERROR: no rc data in file %s\n", $ArgRc||"''";
    exit 1;
  }
  foreach (qw(CopyNum CopySize SeqNum SeqLen Polymorph)) {
    if (! exists $$pRc{$_}) {
      printf STDERR "ERROR: directive %s is undefined in rc file, exiting\n", $_||"''";
      exit 1;
    }
  }
  ref($$pRc{SeqLen}) or $$pRc{SeqLen} = [ $$pRc{SeqLen} ];
  foreach $pRcPolymorph (@{$$pRc{Polymorph}}) {
    unless ((exists $$pRcPolymorph{Num} or exists $$pRcPolymorph{NumRelPercons})
         and exists $$pRcPolymorph{Partit}) {
      printf STDERR "ERROR: directive in polymorphism entry is undefined, exiting\n";
      exit 1;
    }
    $$pRcPolymorph{Num} ||= &nearest (1, $$pRcPolymorph{NumRelPercons}*$$pRc{CopySize});
    $$pRcPolymorph{PartitNum} = &nearest (1, $$pRcPolymorph{Partit}*$$pRc{CopyNum});
  }

  ##############################################################################
  # generate alignment

  # generate data pool
  $pAlnAc = [];
  for ($CtCol=0; $CtCol<$$pRc{CopySize}; $CtCol++) {
    $$pAlnAc[$CtCol] = [ (0) x $$pRc{CopyNum} ];
  }
  foreach $pRcPolymorph (@{$$pRc{Polymorph}}) {
    for ($CtPolymorph=0; $CtPolymorph<$$pRcPolymorph{Num}; $CtPolymorph++) {
      $CtCol = int rand $$pRc{CopySize};
      for (0 .. ($$pRcPolymorph{PartitNum}-1)) {
        $$pAlnAc[$CtCol][$_] = $HavePolymorph[$CtCol] + 1;
      }
      $$pAlnAc[$CtCol] = &RandArrayOrder ($$pAlnAc[$CtCol]);
      $HavePolymorph[$CtCol] ++;
    }
  }

  # nt symbol version of the simulation
  $pAlnSmb = &DataClone ($pAlnAc);
  foreach $CtCol (@$pAlnSmb) {
    $pSmb = &RandArrayOrder ([qw(A C G T)]);
    @$CtCol = map { $$pSmb[$_] } @$CtCol;
  }

  # shuffle columns, convert to alignment
  $pAln = &TableConvert ('AC', 'AA', $pAlnAc);
  $pAlnSmb = [ map { { sequence=>join('',@$_) }}
     @{ &TableConvert ('AC', 'AA', $pAlnSmb) }];

  ##############################################################################
  # alignment properties

  if ($debug||1) {
    foreach (@$pAln) {
      printf STDERR "%s\n", join ('', @$_);
    }
    print  STDERR "\n";
    foreach $CtSeq (@$pAlnSmb) {
      printf STDERR "%s\n", $$CtSeq{sequence};
    }
    print  STDERR "\n";
  }

  # plots:
  # - consensus deviations
  # - symbol diversity
  $pTabConsDev = (&AlnTabConsDev($pAlnSmb,-debug=>$dbg2))[0];
  $file{ConsDev} = "${OutStump}${RedoNum}_ConsDev.tab";
  if ($hOutTab = FileHandle->new($file{ConsDev},'w')) {
    printf "writing file %s\n", $file{ConsDev};
  } else {
    printf STDERR "ERROR: unable to write file %s\n", $file{ConsDev};
  }
  @column            = qw(pos class emit     freq      id );
  @ColLabel{@column} = qw(pos class emission frequency IDs);
  print  $hOutTab "# table of consensus deviations (polymorphisms)\n";
  printf $hOutTab "# date/time: %s\n", &TimeStr();
  printf $hOutTab "# consensus length: %d\n", $$pRc{CopySize};
  printf $hOutTab "# symbol diversity (pi): %.6f\n", &AlnDiversity ($pAlnSmb, -ConsLen=>$$pRc{CopySize}, -ConsDev=>$pTabConsDev);
  printf $hOutTab "#\n# column labels:\n# %s\n", join ("\t", @ColLabel{@column});
  foreach $CtPos (sort {
      $$a{pos} <=> $$b{pos} or
    $$a{class} cmp $$b{class} or
     $$a{emit} cmp $$b{emit} } @$pTabConsDev
  ) {
    printf $hOutTab "%s\n", join ("\t", @{$CtPos}{@column});
  }

  # assembly resistance
  foreach $CtSeq (@$pAlnSmb) {
    $CtPos = 1;
    ProgShotgunPolymorphSubstr: while ($CtPos <= $$pRc{CopySize}) {
      $SeqSlice = substr ($$CtSeq{sequence}, 0, $CtPos);
      if ((grep { $SeqSlice eq $_ }
           map  { substr ($_->{sequence}, 0, $CtPos) } @$pAlnSmb) > 1) {
        $CtPos ++; next ProgShotgunPolymorphSubstr;
      } else {
        last ProgShotgunPolymorphSubstr;
      }
    }
    push @resolve, $CtPos;
  }
  $file{ResolveData} = "${OutStump}${RedoNum}_ResolveData.dat";
  &WriteFile ($file{ResolveData}, join ('', map { "$_\n" } @resolve));
  $file{ResolveStat} = "${OutStump}${RedoNum}_ResolveStat.dat";
  $hOutDat = FileHandle->new($file{ResolveStat},'w');
  &DataPrint (&SampleMetrics(\@resolve,-debug=>$dbg2), -handle=>$hOutDat);

  ##############################################################################
  # do shotgun drawings

  $CtPolymorph = 0;
  push @plot, { seq=>0, nt=>0, AlnCover=>0, polymorph=>0, RelPolymorph=>0 };

  foreach $CtSeq (1 .. $$pRc{SeqNum}) {

    # draw sequence stretch from alignment
    # note: alignment sequence is represented by a reference to an array of
    #   positional values
    $AlnSeq = &RandArrayValue ($pAln);
    $iSeqLen = &RandArrayValue ($$pRc{SeqLen});
    # positions are held in computational logics
    $SeqOff = &RandInt ($$pRc{CopySize} - $ProgParam{shotgun}{overlap},
                -min=>(-$iSeqLen + $ProgParam{shotgun}{overlap}));

    # neglect overhangs in sequence stretch
    if ($SeqOff < 0) {
      $iSeqLen += $SeqOff;
      $SeqOff = 0;
    }
    $iSeqLen = &Min ($iSeqLen, $$pRc{CopySize} - $SeqOff);
    $CtNt += $iSeqLen;
    $debug and printf STDERR "overlap %d\n", $iSeqLen;

    # evaluate sequence stretch
    for ($CtPos=$SeqOff; $CtPos<($SeqOff+$iSeqLen); $CtPos++) {
      if (%{$emitted[$CtPos]} and
          ! $emitted[$CtPos]{$$AlnSeq[$CtPos]}) {
        $CtPolymorph ++;
      }
      $emitted[$CtPos]{$$AlnSeq[$CtPos]} = 1;
    }

    # add count to plot
    push @plot, {
      seq          => $CtSeq,
      nt           => $CtNt,
      AlnCover     => $CtNt / @{$$pAln[0]},
      polymorph    => $CtPolymorph,
      RelPolymorph => $CtPolymorph / $$pRc{PolymorphNum},
      };
  }
  if ($debug) {
    print  STDERR "\n";
    foreach $CtEmit (0, 1) {
      printf STDERR "%s\n", join ('', map{ $_->{$CtEmit}?$CtEmit:' ' } @emitted);
    }
    print  STDERR "\n";
  }

  # output plots
  $file{FinData} = "${OutStump}${RedoNum}.dat";
  $hOutTab = FileHandle->new($file{FinData},'w');
  foreach (@plot) {
    printf $hOutTab "%s\n", join ("\t", @{$_}{'nt','polymorph'});
  }
  $file{FinNorm} = "${OutStump}${RedoNum}_norm.dat";
  $hOutTab = FileHandle->new($file{FinNorm},'w');
  foreach (@plot) {
    printf $hOutTab "%s\n", join ("\t", @{$_}{'AlnCover','RelPolymorph'});
  }
}
# $Id: Phylon.pl,v 1.18 2008/06/11 08:44:58 szafrans Exp $
