################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Contig Set Data Handling
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 2000-2002,2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - purpose
#   This package deals with sequence alignments at an abstract topological
#   level, i.e. independent from an actual sequence.
#
# - individual description of functions at the beginning of the code blocks
#
# - analysis of clone information (meant to be presented in a contig set data
#   structure) funtamentally relies on read identifier syntax rules which are
#   case-specific (project-specific) in some respect. These read identifier
#   syntax rules should be implemented in &ReadWatch::Read::ReadidToFields
#
#   dependencies:
#   - &ContigStructClone -> &ReadWatch::Read::ReadidToFields
#
#   However, the code should work with the read naming syntax concept
#   established at the GSC Jena.
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %_LibParam
#
# - basics
#   $_LibParam{TmpManag}
#   contig set data structure
#
# - contig set data I/O and manipulation
#   &ContigStructMeta
#   &ContigStruct  (not exported)
#
# - manage contigs in contig set data
#   &ContigId
#   &ContigSeq
#   &ContigSpecif
#
# - manage contig ends in contig set data
#   &CendId
#   &CendIdSort
#   &CendCntrCend
#
# - manage reads / clones in contig set data
#   &ContigStructReaddel
#   &ContigStructClone
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqAlign::Assembly;

# includes
use strict; #use warnings;  # OK 20040818
use MainLib::Data;
use MainLib::Path;
use MainLib::FileTmp;
use MainLib::Misc qw(&MySub);
use Math::kCalc qw(&Sum);
use database::DbPlain;
use SeqLab::SeqBench qw(%SyntaxTranslNtdrc);
use SeqAlign::Gap;
use ReadWatch::Read;
use ReadWatch::Library;

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &ContigStructMeta
  &ContigId &ContigSeq &ContigSpecif
  &CendId &CendIdSort &CendCntrCend
  &ContigStructReaddel
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# basics
################################################################################


# handle temporary files globally
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $_LibParam{TmpManag} = MainLib::FileTmp->new(); };
&$pcFT;


# contig set data structure
#
# DESCRIPTION
# - &ContigStruct parses this from contig set data sources, see function
#   to learn about source format capabilities.
# - a leading 'o' mark data entries which are returned from &ContigStruct
#   only if an appropriate option is set.
# - NOTE: all positional information should refer to the visible sequence
#   ranges in the assembly
#
# %root
# =====
#
# source->@      reference to array of sources, see %source
# contig->%      reference to hash of contigs, see %contig
#                temporarily reference to array of contigs in &ContigStruct
# clone->%       reference to hash of clones -> primer -> number
#                cf. &ReadWatch::Read::ReadTreeIndex(..., -format=>'clone')
#                  for details
#                To loop over the primer entries of a clone one has to skip
#                  at least the id entry!
# read->%        reference to hash of readings, each of them itself
#                represented by a hash, see %read
#                temporarily reference to array of contigs in &ContigStruct
# annotation->@  reference to array of annotations, each of them itself
#                represented by a hash, see %annotation
#                set option -annot in &ContigStruct
#
# %source
# =======
#
#   path         path of contig data source
#   type         format of contig data source, one of:
#                'GAP4 DB', 'GAP4 show relationships'
#
# %contig
# =======
#
#   root         reference to data structure root
#   source       reference to source data entry
#   number       number of the contig (GAP4's positional contig number)
#   id           alphanumerical ID of the leftmost reading
# ( idnum        numerical ID of the leftmost reading
#                This field is valid only for source formats GAP4-DB, GAP4-
#                ShowRel.
#   length       contig length
#   read->%      reference to hash of readings
#   clone->%     reference to hash of clones -> hash of primer directions
#                  -> read suffix -> reads
#                cf. &ReadWatch::Read::ReadTreeIndex(..., -format=>'clone')
#                  for details
#                To loop over the primer entries of a clone one has to skip
#                  at least the id entry!
#   effoff       effective offset of reliable part of the contig
#                Only contig ends are concerned here, unreliable islands
#                not taken into account.
#                set option -RelyOn in &ContigStruct or the value will be 1.
#   effend       effective end of reliable part of the contig
#                Only contig ends are concerned here, unreliable islands
#                not taken into account.
#                set option -RelyOn in &ContigStruct or the value will be $contig{length}.
# o end          hash of contig ends (-1: left end, 1: right end).
#                set option -cend in &ContigStruct
# o sequence     reference to sequence entry data structure
#                see &ContigSeq
# o annotation->@  reference to array of annotations, see %annotation
#                set option -annot in &ContigStruct
#
# %cend
# =====
# - set option -cend in &ContigStruct
#
# o contig       reference to contig data entry
# o idnum        contig end specifier numeric (-1: left, 1: right)
# o id           alphanumeric contig end specifier ("ContigId-EndDirSpec",
#                where ContigId is alphanumeric contig ID, EndDirSpec is
#                "5'" or "3'" according to idnum)
# o read->@      reference to array of reads directed towards this contig end
#
# %clone
# ======
# - the entries are different between $report{clone} and
#   $report{contig}{$contig}{clone} in that respect that the first
#   structure reflects the complete project and the second one only the
#   contig situation.
# - entries in $report{contig}{$contig}{clone}
#   never contain fields marked with 'o' in the following listing.
#
#   id           alphanumerical ID of the clone
#   1 -1 0       primer direction entries -> read suffix -> reads
#                cf. &ReadWatch::Read::TreeIndex(..., -format=>'clone')
#                  for details.
#                To loop over the primer entries of a clone one has to skip
#                  at least the id entry!
# o SeqStatus    sequencing status:
#                0    single read
#                0.5  at least two reads originate from independent primers
#                     (including custom-primed reads)
#                1    both fwd and rev reads exist
#                set option -pair in &ContigStruct (global clone structure only)
#                site of generation is &ContigStructClone
# o DispsPrd     clone-end-derived reads on their own dispersed over several
#                contigs => inconsistency; list of contigs
# o DispsClnEnd  clone-ends dispersed over two contigs
# o DispsPlusOther
#                other than clone-end-derived reads are dispersed over
#                additional contigs; list of contigs
# o DispsRead    reads dispersed over several contigs
# o PairStatus   pairing status - partly after processing by &ScaffdStruct:
#                undef  SeqStatus == 0
#                0      mispaired
#                0.5    unknown
#                1      well-paired (fwd and rev in same contig)
#                set option -pair in &ContigStruct (global clone structure only)
# o PairError    error message for PairStatus == 0
#
# %read
# =====
#   id           alphanumerical ID of the reading
# ( idnum        numerical ID of the reading
#                This field is valid only for source formats GAP4-DB, GAP4-
#                ShowRel.
#   length       reading length
#   contig       reference to contig data structure
#   pos          contig position of the reading ({'-1'}: position of left end,
#                {'1'}: position of right end). Reading end specifications refer
#                to the current topology of the alignment, not to the original
#                orientation of the reading.
#   orient       orientation of the reading in relation to contig
#                (1 for plus, -1 for minus)
#   trace        flag for existence of a trace file. This may change to
#                name of trace file in future implementations
#                *** implement me ***
# o cend         reference to contig end towards which the read is directed
#                set option -cend in &ContigStruct. Presence of this reference
#                will depend from the value set in option -cend supplied to
#                &ContigStruct, set to 0 to reference contig end for all
#                reads.
#                set option -cend in &ContigStruct
# o CEndDist     distance from read offset to contig end. It'll be calculated
#                independent from the value set in option -cend supplied to
#                &ContigStruct
#                set option -cend in &ContigStruct
# o annotation->@  reference to array of annotations, see %annotation
#                set option -annot in &ContigStruct
#
# %annotation
# ===========
# - set option -annot in &ContigStruct
#
# o contig       back-reference to contig data structure
# o cpos         contig position of the annotation offset
# o corient      orientation on the contig ('1': plus, '-1': minus)
# o length       length of the annotation
# o read         back-reference to reading data structure
#                this applies for reading annotations only
# o rpos         reading position of the annotation offset
#                this applies for reading annotations only
# o rorient      orientation on the reading ('1': plus, '-1': minus)
#                this applies for reading annotations only
#


################################################################################
# contig set data I/O and manipulation
################################################################################


# represent contig set data by data structure
#
# INTERFACE
# - argument 1: reference to list of contig data source arguments:
#         1     path of contig set source, which may be:
#               GAP4 database  invoke 'show relationships' report and parse
#                              it
#               GAP4 showrel   parse it
#         2* a  reference to array of additional source arguments
#               like argument 1
#         2* b  reference to array of contig ID selectors
#
# - options:
#   same as &ContigStruct
#
# - return val: - reference to contig set data structure
#               - undef if an error occurs
#
# DESCRIPTION
# - this is meta-wrap of &ContigStruct offering argument syntax alternatives.
#
sub ContigStructMeta {
  my ($pArgSrc,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  $debug and printf STDERR "%s. switches:\n", &MySub;
  $debug and &DataPrint (\%opt, -handle=>\*STDERR, -space=>2);
  unless (@$pArgSrc) {
    die sprintf "%s. ERROR: need a minimum of 1 arg: <ContigSource>\n", &MySub;
  }

  my ($ItSrc, $pCtgStructMst, $pCtgStructSlv);
  my ($pCtg, $pRead, $pEntry);

  # chain depending on type of additional arguments
  # test: arg[1] is read ID => use as contig selector
  if (@$pArgSrc>1 and $$pArgSrc[1]!~m/\.[rstp]\d$/ and -e $$pArgSrc[1]) {

    # multiple source arguments
    $debug and printf STDERR "%s. multi-source mode\n", &MySub;
    foreach $ItSrc (@$pArgSrc) {
      $pCtgStructSlv = &ContigStruct ($ItSrc,
        %opt, 
        -pair => 0);  # analyse read pairing later on master structure

      # add elements of new data structure into master data structure
      if ($pCtgStructMst) {
        foreach $pRead (values %{$$pCtgStructSlv{read}}) {
          if (exists $$pCtgStructMst{read}{$$pRead{id}}) {
            $debug and printf STDERR "%s. read %s duplicated over sources %s / %s\n", &MySub,
              $$pRead{id}, $$pCtgStructMst{read}{$$pRead{id}}{contig}{source}{path}, $$pCtgStructSlv{source}[0]{path};
            &ContigStructReaddel ($pRead);
          } else {
            $$pCtgStructMst{read}{$$pRead{id}} = $pRead;
          }
        }
        foreach $pCtg (values %{$$pCtgStructSlv{contig}}) {
          unless (int values %{$$pCtg{read}}) {
            $debug and printf STDERR "%s. contig %s from source %s redundant now\n", &MySub,
              $$pRead{id}, $$pCtgStructSlv{source}[0]{path};
            foreach (values %{$$pCtg{end}}) { %$_ = (); }
            %$pCtg = ();
            next;
          }
          $$pCtgStructMst{contig}{$$pCtg{id}} = $pCtg;
          $$pCtg{root} = $pCtgStructMst;
        }
        push @{$$pCtgStructMst{source}}, @{$$pCtgStructSlv{source}};
      }

      # initialize master data structure
      else {
        $pCtgStructMst = $pCtgStructSlv;
      }
    }

    # create new root clone index
    $$pCtgStructMst{clone} = &ReadTreeIndex ($$pCtgStructMst{read},
      -format=>'clone', -idref=>1, -debug=>$dbg2);

    # analyse read pairing now on master structure
    $opt{-pair} and &ContigStructClone ($$pCtgStructMst{clone});
  }

  # single source and possibly contig selector(s)
  else {
    $pArgSrc = &DataClone ($pArgSrc);
    push @{$opt{-SlcContig}}, splice (@$pArgSrc, 1);
    $pCtgStructMst = &ContigStruct ($$pArgSrc[0], %opt);
  }

  ##############################################################################
  # debugging

  if ($debug) {
    # NOTE: with back-references in data structure &DataPrint doesn't work
    printf STDERR "%s. final data structure:\n", &MySub;
    printf STDERR "  sources: %d - 1st %s\n", int(@{$$pCtgStructMst{source}}), $$pCtgStructMst{source}[0]{path};
    printf STDERR "  contigs: %d\n", int(keys %{$$pCtgStructMst{contig}});
    printf STDERR "  clones: %d\n", int(keys %{$$pCtgStructMst{clone}});
    printf STDERR "  readings: %d\n", int(keys %{$$pCtgStructMst{read}});
    if ($opt{-annotation}) {
    printf STDERR "  annotations: %d\n", int(@{$$pCtgStructMst{annotation}})-1;
    }
  }

  # exit SUB
  return $pCtgStructMst;
}


# represent contig set data by data structure
#
# INTERFACE
# - argument 1:    path of contig set source, which may be:
#                  GAP4 database  invoke 'show relationships' report and parse
#                                 it
#                  GAP4 showrel   parse it
#
# - options:
#   -annotation    construct annotation data sub-structures
#   -cend          construct contig end data sub-structures
#                  For a non-zero switch value the analysis will be restricted
#                  on the specified contig end range.
#   -debug         [STD]
#   -OutReport     save showrelationship report to specified file
#                  This works on true GAP4 projects only
#   -pair          analyse paring status for clones
#   -SlcCnum       select for minimum clone number in contig
#   -SlcContig     reference to array of contig IDs
#                  The data structure will be restricted to the specified
#                  contigs. Contig IDs may be either alphanumeric ID or
#                  numeric ID.
#   -SlcLen        select input according to contig length. Argument may be
#                  either:
#                  - reference range object or similar data structure
#                  - string-fashioned range specification. Minimum syntax:
#                    minimum value for contig length
#                  switch takes effect in &GapSeqStruct
#   -SlcRnum       select for minimum reading number in contig
#   -SlcSpecif     select contigs for target specificity. Supply specificity
#                  set ID.
#   -SlcSpecifDlt  minimum delta of specificity measure
#   -RcTgtSpecif   use this rc file for target specificity library (needed
#                  for source/target specificity calculations)
#   -RelyOn        define reads which are reliable (option argument is a
#                  regular expression). Fields
#                   $report{contig}{effend}{'-1'} and
#                   $report{contig}{effend}{'1'}
#                  will be set according to the read status
#   -timer         print time performance protocol to STDERR
#
# - return val:    - reference to contig set data structure
#                  - undef if an error occurs
#
# DESCRIPTION
# - for a description of contig set data structure see above
#
sub ContigStruct {
  my ($ArgSrc,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  if (exists($opt{-SlcContig}) and !int(@{$opt{-SlcContig}})) {
    delete $opt{-SlcContig};
  }
  my $RegexpRelyOn = $opt{-RelyOn};
  my $bTimer = $opt{-timer}; my $time;
  unless ($ArgSrc) {
    die sprintf "%s. ERROR: need a minimum of 1 arg: <ContigSource>\n", &MySub;
  }

  my ($pGapSeqStruct, $pGapDataStruct);
  my ($pCtg, $CtCtg, $pRead, $pAnnotRaw, $pAnnot);

  ##############################################################################
  # get primary data
  # chain depending on source value for primary data

  # source is GAP4 database
  if (-B $ArgSrc) {
    $bTimer and $time = &Sum((times)[0,2]); 
    $pGapSeqStruct = &GapSeqStruct ($ArgSrc, %opt);
    $$pGapSeqStruct{source} = [ {path=>$ArgSrc,type=>'GAP4 DB'} ];
    $bTimer and printf STDERR "%s. CPU time for retrieving GAP4 data (\&GapSeqStruct): %.3f s\n", &MySub, &Sum((times)[0,2])-$time;
    if ($opt{-annotation}) {
      $bTimer and $time = &Sum((times)[0,2]); 
      $pGapDataStruct = &GapDataStruct ($ArgSrc, %opt, -nice=>1);
      $bTimer and printf STDERR "%s. CPU time for retrieving GAP4 data (\&GapDataStruct): %.3f s\n", &MySub, &Sum((times)[0,2])-$time;
    }
  }

  # source is contig index
  elsif (($ArgSrc =~ m/Read.tab/ and -T $ArgSrc and $ArgSrc = $`) or -T ($ArgSrc .'Read.tab')) {
    if ($opt{-annotation}) {
      printf STDERR "%s. ERROR: unable to get primary annotation data from non-GAP4-DB source\n", &MySub;
      return undef;
    }
    $$pGapSeqStruct{source} = [ {path=>$ArgSrc,type=>'contig source index'} ];
    unless ($$pGapSeqStruct{contig} = &PlainToTable ($ArgSrc.'Contig.tab', -TabType=>'AH',
      -ColLabel => ['contig_id','contig_idnum','contig_source','contig_length','read_num'],
      -comments => 1,
      -debug    => $dbg2)) {
      printf STDERR "%s. ERROR: no data in file %s\n", &MySub, $ArgSrc.'Contig.tab';
      return undef;
    }
    unless ($$pGapSeqStruct{read} = &PlainToTable ($ArgSrc.'Read.tab', -TabType=>'AH',
      -ColLabel => ['read_id','contig_source','contig_id','contig_idnum','read_offset','read_length','read_orient','read_true'],
      -comments    => 1,
      -debug       => $dbg2)) {
      printf STDERR "%s. ERROR: no data in file %s\n", &MySub, $ArgSrc.'Read.tab';
      return undef;
    }

    # create first-pass data structure
    # loop over contigs
    foreach $pCtg (@{$$pGapSeqStruct{contig}}) {

      # rename fields
      my $pTmp = { contig_id=>'id', contig_idnum=>'idnum',
        contig_source=>undef, contig_length=>'length', read_num=>undef };
      while (@_ = each %$pTmp) {
        $_[1] and $$pCtg{$_[1]} = $$pCtg{$_[0]};
        delete $$pCtg{$_[0]};
      }

      # group reads into contigs
      $$pCtg{read} = { map { ($_->{read_id}=>$_) }
        map  { $_->{contig} = $pCtg; $_; }
        grep { $_->{contig_id} eq $$pCtg{id} }
        @{$$pGapSeqStruct{read}} };
    }
    # loop over reads
    foreach $pRead (@{$$pGapSeqStruct{read}}) {

      # rename fields
      $$pRead{pos}{'-1'} = $$pRead{read_offset};
      $$pRead{pos}{'1'} = $$pRead{read_offset} + $$pRead{read_length} - 1;
      my $pTmp = { read_id=>'id',
        contig_source=>undef, contig_id=>undef, contig_idnum=>undef,
        read_offset=>undef, read_length=>'length', read_orient=>'orient',
        read_true=>'trace' };
      while (@_ = each %$pTmp) {
        $_[1] and $$pRead{$_[1]} = $$pRead{$_[0]};
        delete $$pRead{$_[0]};
      }
    }
  }

  # source is GAP4 showrelationships report
  elsif (-T $ArgSrc) {
    if ($opt{-annotation}) {
      printf STDERR "%s. ERROR: unable to get primary annotation data from non-GAP4-DB source\n", &MySub;
      return undef;
    }
    $pGapSeqStruct = &GapSeqStruct ($ArgSrc, %opt);
    $$pGapSeqStruct{source} = [ {path=>$ArgSrc,type=>'GAP4 show relationships'} ];
  }

  # source is GAP4 directed assembly file tree
  elsif (0 and -d $ArgSrc and -e "$ArgSrc/.isGapDirectedTree") {
    # not implemented yet
  }

  # undefined contig source type
  else {
    printf STDERR "%s. ERROR: unknown contig set source type for source %s\n", &MySub, $ArgSrc||"''";
    return undef;
  }

  # debug
  if ($debug) {
    printf STDERR "%s. got primary contig set data:\n", &MySub;
    printf STDERR "  contigs: %d (%s)\n", int(@{$$pGapSeqStruct{contig}}),
      join (' ', map { ref($_)?$_->{id}:$_ }
        &ListMaxfirst($$pGapSeqStruct{contig},3,-ElemExceed=>'...'));
    my $CtCtg = 2;
    my $CtRead = 5;
    printf STDERR "  readings: %d (%s)\n",
      int (map{ keys %{$_->{read}} }@{$$pGapSeqStruct{contig}}),
      join (' ', (grep { $_ and ($CtRead--)>0 } map { keys %{$_->{read}} }
        grep { $_ and ($CtCtg--)>0 } @{$$pGapSeqStruct{contig}}), ($CtRead<0) ? '...':());
    if ($opt{-annotation}) {
    printf STDERR "  annotations: %d\n", int @{$$pGapDataStruct{annotation}} - 1;
    }
  }

  ##############################################################################
  # data refining:
  # - contig clone indices
  # - effective contig offset/end
  $bTimer and $time = (times)[0];

  # loop over contigs
  foreach $pCtg (@{$$pGapSeqStruct{contig}}) {

    # construct contig clone index
    $$pCtg{clone} = &ReadTreeIndex ($$pCtg{read}, -format=>'clone', -idref=>1, -debug=>$dbg2);

    # effective offset/end of contig (neglecting foreign reads)
    if ($RegexpRelyOn) {
      foreach $pRead (sort { $a->{pos}{'-1'} <=> $b->{pos}{'-1'} } values %{$$pCtg{read}}) {
        if ($$pRead{id} =~ m/$RegexpRelyOn/) {
          $$pCtg{effend}{'-1'} = $$pRead{pos}{'-1'};
          last;
        }
      }
      foreach $pRead (sort { $b->{pos}{'1'} <=> $a->{pos}{'1'} } values %{$$pCtg{read}}) {
        if ($$pRead{id} =~ m/$RegexpRelyOn/) {
          $$pCtg{effend}{'1'} = $$pRead{pos}{'1'};
          last;
        }
      }
      $debug and printf STDERR "%s. reliable ends of contig %s: 5' %d, 3' %d\n", &MySub,
        $$pCtg{id}||"''", $$pCtg{effend}{'-1'}, $$pCtg{effend}{'1'};
    } else {
      $$pCtg{effend}{'-1'} = 1;
      $$pCtg{effend}{'1'}  = $$pCtg{length};
    }
  }

  ##############################################################################
  # selection:
  # - minimum clone number in contig
  # - target specificity

  # select for minimum clone number in contig
  if ($opt{-SlcCnum}) {
    $debug and printf STDERR "%s. select contigs according to clone number\n", &MySub;
    @{$$pGapSeqStruct{contig}} = grep { int(keys(%{$_->{clone}})) >= $opt{-SlcCnum} }
      @{$$pGapSeqStruct{contig}};
  }
  # select for minimum read number in contig
  if ($opt{-SlcRnum}) {
    $debug and printf STDERR "%s. select contigs according to read number\n", &MySub;
    @{$$pGapSeqStruct{contig}} = grep { int(keys(%{$_->{read}})) >= $opt{-SlcRnum} }
      @{$$pGapSeqStruct{contig}};
  }

  # select for target specificity
  $debug and printf STDERR "%s. source specificity selection: %s\n", &MySub, $opt{-SlcSpecif} ? 'YES':'NO';
  if ($opt{-SlcSpecif}) {
    foreach $pCtg (@{$$pGapSeqStruct{contig}}) {
      &ContigSpecif ($pCtg, $opt{-SlcSpecif},
        -delta=>$opt{-SlcSpecifDlt}, -rc=>$opt{-RcTgtSpecif},
        -debug=>$debug) and
        push @{$$pGapSeqStruct{contignew}}, $pCtg;
    }
    $$pGapSeqStruct{contig} = $$pGapSeqStruct{contignew};
  }

  ##############################################################################
  # global indices
  # - reads
  # - clones
  # data reorganisation
  # - refine contig data sub-structures
  # - pack contigs into hash

  # construct root read index
  # loop over contigs
  delete $$pGapSeqStruct{read};
  foreach $pCtg (@{$$pGapSeqStruct{contig}}) {
    # the following is much faster than
    # %{$$pGapSeqStruct{read}}  = ( %{$$pGapSeqStruct{read}}, %{$$pCtg{read}} );
    # cause processed in linear time with rising number of contigs
    while (@_ = each %{$$pCtg{read}}) {
      $$pGapSeqStruct{read}{$_[0]} = $_[1];
    }
  }

  # construct root clone index
  # this has to be made from scratch cause reads from one clone may be spread
  #   over several contigs
  $$pGapSeqStruct{clone} = &ReadTreeIndex ($$pGapSeqStruct{read},
    -format=>'clone', -idref=>1, -debug=>$dbg2);

  # add root reference to contig data structure
  # turn array of contigs into hash
  $$pGapSeqStruct{contig} = { map { ($_->{id}=>$_) }
    map {
      $_->{root} = $pGapSeqStruct;
      $_->{source} = $$pGapSeqStruct{source}[0];
      $_;
    } @{$$pGapSeqStruct{contig}} };

  $bTimer and printf STDERR "%s. CPU time for standard expansion of contig data: %.3f s\n", &MySub, (times)[0]-$time;

  ##############################################################################
  # enter annotations

  if ($opt{-annotation}) {
    $bTimer and $time = (times)[0];

    # features that are not required in $pGapDataStruct
    foreach my $ItField (qw(annotation reading contig)) {
      foreach my $pEntry (@{$$pGapDataStruct{$ItField}}) {
        foreach (qw(next prev ref_base referee)) {
          delete $$pEntry{$_};
        }
      }
    }

    # loop over contigs
    foreach $pCtg (@{$$pGapDataStruct{contig}}) {
      unless ($pCtg and %$pCtg) {
        printf STDERR "ERROR: found undefined contig (%s)\n", $pCtg;
        next;
      }

      # work around deselected contigs
      if ($opt{-SlcContig} and not exists $$pGapSeqStruct{contig}{$$pCtg{id}}) { next }

      # grab first annotation in list for contig
      if ($pCtg and %$pCtg and $$pCtg{annotations}||0 and
          $pAnnotRaw=$$pGapDataStruct{annotation}[$$pCtg{annotations}]
      ) {
        { # redo block for chain of annotations

          # derive annotation entry from raw data
          $pAnnot = { %$pAnnotRaw };
          $$pAnnot{contig} = $$pGapSeqStruct{contig}{$$pCtg{id}};
          unless ($$pAnnot{contig}) {
            printf STDERR "ERROR: annotation %d resides on unknown contig (%s)\n",
              $$pAnnot{data_num}, $$pCtg{id}||"''";
          }
          $$pAnnot{corient} = $$pAnnot{strand} ? -1 : 1;
          $$pAnnot{cpos} = $$pAnnot{position};
          foreach (qw(data_num data_type position strand)) {
            delete $$pAnnot{$_};
          }

          # debug
          if ($debug) {
            printf STDERR "contig annotation (contig %s)  {\n", $$pAnnot{contig}{id};
            while (@_ = each %$pAnnot) {
              print STDERR "  $_[0]  $_[1]\n";
            }
            print  STDERR "}\n";
          }

          # enter entry
          foreach ($pGapSeqStruct, $$pAnnot{contig}) {
            push @{$_->{annotation}}, $pAnnot;
          }

          # follow next annotation in chain
          if ($$pAnnotRaw{next} and
              $pAnnotRaw=$$pGapDataStruct{annotation}[$$pAnnotRaw{next}]) {
            redo;
          }
        }
      }  # end: if ($pCtg .. $pAnnotRaw=$$pGapDataStruct...)
    }

    # loop over readings
    foreach $pRead (@{$$pGapDataStruct{reading}}) {
      unless ($pRead and %$pRead) {
        printf STDERR "ERROR: found undefined read (%s)\n", $pRead||'_undef_';
        next;
      }

      # work around deselected contigs
      if ($opt{-SlcContig} and !exists($$pGapSeqStruct{read}{$$pRead{id}})) { next }

      # grab first annotation in list for reading
      if ($pRead and %$pRead and $$pRead{annotations}||0 and
          $pAnnotRaw=$$pGapDataStruct{annotation}[$$pRead{annotations}]
      ) {
        { # redo block for chain of annotations

          # determine referees for annotation
          $pAnnot = { %$pAnnotRaw };
          $$pAnnot{read} = $$pGapSeqStruct{read}{$$pRead{id}};
          unless ($$pAnnot{read}) {
            printf STDERR "ERROR: annotation %d resides on unknown reading (%s)\n",
              $$pAnnot{data_num}, $$pRead{id}||"''";
            next;
          }
          $$pAnnot{contig} = $$pAnnot{read}{contig};
          $debug and printf STDERR "%s. annotation %d on: reading %s/%d, contig %s/%d\n", &MySub,
            $$pAnnot{data_num}, $$pAnnot{read}{id}||"''", $$pAnnot{read}{idnum},
            $$pAnnot{contig}{id}||"''", $$pAnnot{contig}{idnum};

          # calculate positional information
          $$pAnnot{position} -= ($$pAnnot{read}{orient} > 0) ?
            $$pGapDataStruct{reading}[$$pAnnot{read}{idnum}]{start} :
            $$pGapDataStruct{reading}[$$pAnnot{read}{idnum}]{length} - $$pGapDataStruct{reading}[$$pAnnot{read}{idnum}]{start} - $$pAnnot{read}{length};
          $debug and printf STDERR "%s. annotation position corrected rightwards (hidden effect): %d -> %d, diff %d\n", &MySub,
            $$pAnnotRaw{position}, $$pAnnot{position},
            $$pAnnot{position}-$$pAnnotRaw{position};
          $$pAnnot{rorient} = $$pAnnot{strand} ? -1 : 1;
          $$pAnnot{rpos} = $$pAnnot{position};
          $$pAnnot{corient} = $$pAnnot{rorient} * $$pAnnot{read}{orient};
          $$pAnnot{cpos} = $$pAnnot{read}{pos}{-$$pAnnot{read}{orient}}
            + ($$pAnnot{position}-1) * $$pAnnot{read}{orient}
            - (($$pAnnot{read}{orient}<0) ? $$pAnnot{length}-1 : 0);

          # delete unneeded information
          foreach (qw(data_num data_type position strand)) {
            delete $$pAnnot{$_};
          }

          # debug
          if ($debug) {
            printf STDERR "read annotation (contig %s, read %s): {\n",
              $$pAnnot{contig}{id}, $$pAnnot{read}{id};
            while (@_ = each %$pAnnot) {
              print STDERR "  $_[0]  $_[1]\n";
            }
            print STDERR "}\n";
          }

          # enter entry
          foreach ($pGapSeqStruct, $$pAnnot{contig}, $$pAnnot{read}) {
            push @{$_->{annotation}}, $pAnnot;
          }

          # follow next annotation in chain
          if ($$pAnnotRaw{next} and
              $pAnnotRaw = $$pGapDataStruct{annotation}[$$pAnnotRaw{next}]) {
            redo;
          }
        }
      }  # end: if ($pRead .. $pAnnotRaw=$$pGapDataStruct...)
    }

    # tidy up
    &DataDecross ($pGapDataStruct);
    $bTimer and printf STDERR "%s. CPU time for entering annotations: %.3f s\n", &MySub, &Sum((times)[0,2])-$time;
  }

  ##############################################################################
  # analyse contig ends
  if (defined $opt{-cend}) {

    # loop over contigs
    foreach $pCtg (values %{$$pGapSeqStruct{contig}}) {
      unless ($$pCtg{effend}{'-1'}) { next }
      $CtCtg ++;  # currently, never used, later on

      # contig end data structure
      foreach my $ItEnd (1, -1) {
        $$pCtg{end}{$ItEnd} = {
          contig => $pCtg,
          idnum  => $ItEnd,
          };
        $$pCtg{end}{$ItEnd}{id} = &CendId ($$pCtg{end}{$ItEnd}, -format=>'compact');
      }

      # loop over all reliable reads in contig
      foreach $pRead (values %{$$pCtg{read}}) {
        if ($RegexpRelyOn and $$pRead{id} !~ m/$RegexpRelyOn/) { next }

        my $ItEnd = $$pRead{orient};
        # calculate contig end distance for ALL reads
        $$pRead{CEndDist} = abs ($$pRead{pos}{-$ItEnd} - $$pCtg{effend}{$ItEnd}) + 1;
        # determine contig end for selection of reads
        if ($opt{-cend} and $$pRead{CEndDist} > $opt{-cend}) { next }
        $$pRead{cend} = $$pCtg{end}{$ItEnd};
        push @{$$pCtg{end}{$ItEnd}{read}}, $pRead;
      }
    }
  }

  ##############################################################################
  # analyse pairing status

  $opt{-pair} and &ContigStructClone ($$pGapSeqStruct{clone});

  ##############################################################################
  # debugging

  if ($debug) {
    # NOTE: with back-references in data structure &DataPrint doesn't work
    printf STDERR "%s. final data structure:\n", &MySub;
    printf STDERR "  sources: %d - 1st %s\n", int(@{$$pGapSeqStruct{source}}), $$pGapSeqStruct{source}[0]{path};
    printf STDERR "  contigs: %d\n", int(keys %{$$pGapSeqStruct{contig}});
    printf STDERR "  clones: %d\n", int(keys %{$$pGapSeqStruct{clone}});
    printf STDERR "  readings: %d\n", int(keys %{$$pGapSeqStruct{read}});
    if ($opt{-annotation}) {
    printf STDERR "  annotations: %d\n", int(@{$$pGapSeqStruct{annotation}})-1;
    }
  }

  # exit SUB
  return $pGapSeqStruct;
}


################################################################################
# manage contigs in contig set data
################################################################################


# work out contig specifier
#
# INTERFACE
# - argument 1: reference to contig data structure
#
# - options:
#   -debug      [STD]
#   -join       use specified string to join field labels in specifier
#               construction
#   -source     include source specifier in specifier construction.
#               Optional switch value 'short' tells the function only to
#               add the file name of the source rather than the full
#               path (default).
#
# - return val: - contig specifier
#               - undef if an error occurs
#
sub ContigId {
  my ($pCtg, %opt) = @_;
  my ($debug);
  my ($pSrc, %spec, $ContigSpec);

  # function parameters
  unless ($pCtg and %$pCtg) { return undef }
  $opt{-join} ||= ' ';

  # source
  if ($opt{-source}) {
    $pSrc = $$pCtg{source} or return undef;
    $spec{source} = ($opt{-source} eq 'short') ?
      &PathSplit($$pSrc{path})->{name} : $$pSrc{path};
  };

  # contig ID alphanumeric
  $spec{contig} = $$pCtg{id} or return undef;

  # contig ID numeric
  if ($$pCtg{idnum}) {
    $spec{contig} .= ' / #' . $$pCtg{idnum};
  };

  # join fields
  $ContigSpec = join ($opt{-join}, values %spec);

  # exit SUB
  return $ContigSpec;
}


# get sequence for contig
#
# INTERFACE
# - argument 1: reference to contig data structure
#
# - options:
#   -debug      [STD]
#   -annot      include annotations in returned sequence data structure
#
# - return val: - reference to sequence data structure
#               - undef if an error occurs
#
# DESCRIPTION
# - all contig sequences of the same source like the specified contig will be
#   loaded at the same time (globally accessible via
#   $$pCtgStruct{contig}{*}{sequence}).
#
# DEVELOPER'S NOTES
# - this function adds kind of an OO feature to the assembly data structure.
#
sub ContigSeq {
  require SeqLab::SeqStreamIn;
  my ($pCtg, %opt) = @_;
  my ($debug, $pCtgStruct, $pSrc);

  # function parameters
  $debug = $opt{-debug};
  $debug and printf STDERR "%s. entered sub\n", &MySub;

  $$pCtg{sequence} and return $$pCtg{sequence};
  $pCtgStruct = $$pCtg{root} or return undef;
  $pSrc = $$pCtg{source} or return undef;
  if ($$pSrc{type} ne 'GAP4 DB') { return undef }

  # retrieve contig sequences
  my $PathSeq = $_LibParam{TmpManag}->Create(-touch=>1);
  my $call = join (' ', $CorePath{call}{GapExportCons}, '-f Experiment',
    $$pSrc{path}, $PathSeq);
  if (&GapSafeCall ($call)) {
    printf STDERR "%s. ERROR: GAP4 consensus call failed: $call\n", &MySub;
    exit 1;
  }
  my $pSeqQueue = SeqLab::SeqStreamIn->new($PathSeq);
  while (my $pSeq = $pSeqQueue->GetNext()) {
    $$pCtgStruct{contig}{$$pSeq{id}} or next;
    delete $$pSeq{AV};
    unless ($opt{-annot}) {
      delete $$pSeq{annot};
    }
    $$pCtgStruct{contig}{$$pSeq{id}}{sequence} = $pSeq;
  }

  # tidy up, exit SUB
  unlink ($PathSeq);
  return $$pCtg{sequence};
}


# return source specificity match for contig
#
# INTERFACE
# - argument 1:   reference to contig set data structure
# - argument 2:   source specifier (case-sensitive)
#
# - options:
#   -debug        [STD]
#   -delta        minimum delta of specificity measure, default: 0
#   -RcTgtSpecif  use this rc file for target specificity library
#
# - return val:   - specificity flag (boolean)
#                 - undef if an error occurs
#
# DEBUG, CHANGES, ADDITIONS
# - the calculation applies to the source/target group 'genome'.
#   This is a rather heuristic implementation. However, in the
#   Dictyostelium project we have some non-genomic DNA in our libraries
#   and we don't want them to be regarded in the calculation.
# - Shouldn't we consider permutations in the combinatorical approach?
#
sub ContigSpecif {
  my ($pCtg, %src, %opt);
     ($pCtg, $src{check}, %opt) = @_;
  my ($debug);
  my ($pSpecif, $delta);

  # function parameters
  $debug = $opt{-debug};
  unless (grep { $_ eq $src{check} }
    &TgtspecifTgtRegard ('genome', -rc=>$opt{-RcTgtSpecif})
  ) {
    printf STDERR "%s. ERROR: undefined source/target specifier %s\n", &MySub, $src{check}||"''";
    return undef;
  }

  # get source specificty info data structure
  $pSpecif = &TgtspecifTgtProbab ([keys %{$$pCtg{read}}],
    'Read', -target=>'genome', -rc=>$opt{-RcTgtSpecif});
  unless (%$pSpecif) {
    printf STDERR "%s. ERROR: found no source/target for contig %s\n", &MySub, $$pCtg{id}||"''";
    return undef;
  }

  # determine winner and 2nd candidate for source specificty
  ($src{won}, $src{'2nd'}) = (sort { $$pSpecif{$b} <=> $$pSpecif{$a} } keys %$pSpecif);
  if ($debug) {
    printf STDERR "%s. contig %s specific for %s, 2nd candidate %s\n", &MySub,
      $$pCtg{id}||"''", $src{won}||"''", $src{'2nd'}||"''";
    &DataPrint ($pSpecif, -handle=>\*STDERR);
  }

  # determine specificity match
  $delta = ($src{won} eq $src{check}) ?
    ($$pSpecif{$src{won}}   - $$pSpecif{$src{'2nd'}}) :
    ($$pSpecif{$src{check}} - $$pSpecif{$src{won}});
  $debug and printf STDERR "%s. delta specificity measure: %s (limit %s)\n", &MySub, $delta, $opt{-delta}||0;
  return ($delta > $opt{-delta}) ? 1 : 0;
}


################################################################################
# manage contig ends in contig set data
################################################################################


# work out contig end identifier/specifier
#
# INTERFACE
# - argument 1: reference to contig end data structure
#
# - options:
#   -debug      [STD]
#   -format     format type in specifier construction:
#               compact  compact (default)
#               loose    loose
#   -join       use specified string to join field labels in specifier
#               construction
#   -label      include specifier field labels in specifier construction
#               This options doesn't work with -format=>'compact'.
#   -source     include source specifier in specifier construction
#               Optional switch value 'short' tells the function only to
#               add the file name of the source rather than the full
#               path (default).
#
# - return val: - contig end identifier/specifier
#               - undef if an error occurs
#
sub CendId {
  my ($pCend, %opt) = @_;
  my ($debug, %JoinDefault);
  my ($pSrc, %spec, $CendSpec);

  # function parameters
  unless ($pCend and %$pCend) { return undef }
  $opt{-format} ||= 'compact';

  # source
  if ($opt{-source}) {
    $pSrc = $$pCend{contig}{source} or return undef;
    $spec{source} = ($opt{-format} eq 'compact' or $opt{-source} eq 'short') ?
      &PathSplit($$pSrc{path})->{name} : $$pSrc{path};
  }

  # contig ID
  $spec{contig} = $$pCend{contig}{id} or return undef;
  if ($opt{-format} ne 'compact' and $$pCend{contig}{idnum}) {
    $spec{contig} .= ' / #' . $$pCend{contig}{idnum};
  }

  # end specification
  $spec{end} = $SyntaxTranslNtdrc{End2Prime}{$$pCend{idnum}};

  # include field labels
  if ($opt{-label} and $opt{-format} ne 'compact') {
    map { $spec{$_} = "$_: $spec{$_}"; } values %spec;
  }

  # join fields
  %JoinDefault = ( compact=>'-', loose=>', ' );
  $opt{-join} ||= $JoinDefault{$opt{-format}};
  $CendSpec = join ($opt{-join}, grep { defined($_) } @spec{'source','contig','end'});

  # exit SUB
  return $CendSpec;
}


# sort function for contig end identifiers
#
# DEBUG, CHANGES, ADDITIONS
# - sort for contig identifier in an intelligent way
#
sub CendIdSort {
  my @CendId;
     ($CendId[0], $CendId[1]) = @_;
  my (@ContigId, @CendNum, $ItCend);

  # function parameters

  # split cend specifier into fields 'contig'/'end'
  foreach $ItCend (0, 1) {
    ($ContigId[$ItCend], undef, $CendNum[$ItCend])
      = ($CendId[$ItCend] =~ m/(^.+)[_-]((\d)'?)$/);
  };

  # sort
  return ( $ContigId[0] cmp $ContigId[1] or
            $CendNum[1] <=> $CendNum[0] );
}


# return contig counter-end of given contig end
#
# INTERFACE
# - argument 1: reference to contig end data structure
# - return val: - reference to contig end data structure
#               - undef if an error occurs
#
sub CendCntrCend {
  my ($pCend) = @_;
  my ($pCendCntr);

  # function parameters
  unless ($pCend and %$pCend) { return undef }

  # counter-contig end
  $pCendCntr = $$pCend{contig}{end}{-$$pCend{idnum}} ||= {
    idnum  => -$$pCend{idnum},
    contig => $$pCend{contig},
    };
  $$pCendCntr{idnum} ||= sprintf ('%s_%s',
    $$pCendCntr{idnum}, $SyntaxTranslNtdrc{End2PrimeNum}{$$pCendCntr{idnum}});

  # exit SUB
  return $pCendCntr;
}


################################################################################
# manage reads in contig set data
################################################################################


# expand clone entry in global index of assembly data structure
#
# INTERFACE
# - argument 1: reference to clone index data sub-structure
#
# - options:
#   -debug      [STD]
#
# DESCRIPTION
# - see description of contig set data structure for definition of
#   SeqStatus and PairStatus.
# - this function analyses the distribution of reads over several contigs
#   So, this code cannot apply to a non-ContigStruct clone index.
#
# DEBUG, CHANGES, ADDITIONS
# - instead of PairStatus/PairError
#   $clone{$prm}{ContigDisperse}  read type overlaps several contigs (error)
#   $clone{ContigDisperse}        clone overlaps several contigs
#                                 the analysis takes into account read type
#                                 dispersion (regarded as error => filtered)
#
sub ContigStructClone {
  my ($pCloneIdx, %opt) = @_;
  my ($debug);
  my ($pClone, %CloneCtg);

  # function parameters
  $debug = $opt{-debug};

  # loop over clones in index
  foreach $pClone (values %$pCloneIdx) {

    # sequencing status: both insert ends sequenced?
    if (exists($$pClone{'1'}) and %{$$pClone{'1'}} and
        exists($$pClone{'-1'}) and %{$$pClone{'-1'}}
    ) {
      $$pClone{SeqStatus} = 1;

      # are clone ends on their own dispersed over several contigs?
      # => surely inconsistencies
      foreach my $ItPrd (qw(1 -1)) {
        $CloneCtg{$ItPrd} = &DataTreeSlc ($$pClone{$ItPrd}, [[0,'all'],['contig']], -unique=>1);
        $debug and printf STDERR "%s. contigs for clone %s, primer direction %s:\n  %s\n", &MySub,
          $$pClone{id}, $ItPrd, join (' ', map{$_->{id}}@{$CloneCtg{$ItPrd}});
        if (@{$CloneCtg{$ItPrd}} > 1) { push @{$$pClone{DispsPrd}}, $ItPrd; }
      }
      if (@{$$pClone{DispsPrd}||[]}) {
        $$pClone{PairStatus} = 0;  # surely inconsistent
        $$pClone{PairError} = sprintf ("contig dispersion of clone ends on its own: %s",
          join (' ', map{sprintf('%+d',$_)} @{$$pClone{DispsPrd}}) );
      } elsif ($CloneCtg{'1'}[0] eq $CloneCtg{'-1'}[0]) {
        $$pClone{PairStatus} = 1;  # consistently paired, residing in single contig
        $$pClone{DispsClnEnd} = 0;
      }
      # pair of clone ends dispersed over two contigs
      else {
        $$pClone{PairStatus} = 0.5;  # unknown consistency of pair status
        $$pClone{DispsClnEnd} = 1;
      }
      $CloneCtg{clneIdx} = { map{($_=>1)} map{@{$_||[]}}
        &DataTreeSlc ([@{$CloneCtg{'1'}},@{$CloneCtg{'-1'}}], [[0,'all']], -unique=>1)
        };

      # analyse dispersion of all clone reads over several contigs
      @{$CloneCtg{DispsPlusOther}} = grep { !$CloneCtg{clneIdx}{$_} } map {@{$_||[]}}
        &DataTreeSlc ($$pClone{'0'}, [[0,'all'],['contig']], -unique=>1);
      $$pClone{DispsRead} = int(
        @{$$pClone{DispsPrd}||[]} or $$pClone{DispsClnEnd} or @{$$pClone{DispsPlusOther}||[]}
        );

      # even though single reads may be wrongly named (and this probability
      # rises with the number of reads per clone!), consistent pairs may be
      # found in the read pool!
      # *** implement me ***
    }

    # not both insert ends sequenced
    else {
      $$pClone{SeqStatus} = 0;

      # clone reads dispersed over contigs?
      $CloneCtg{all} = &DataTreeSlc ($pClone, [[0,'all'],[0,'all'],['contig']],
        -unique=>1);
      $$pClone{DispsRead} = int (@{$CloneCtg{all}} > 1);
    }
  }
}


# eliminate read from contig set data structure
#
# INTERFACE
# - argument 1: reference to read data sub-structure
#
# - options:
#   -debug      [STD]
#
# - return val: success status (boolean)
#
# DEBUG, CHANGES, ADDITIONS
# - delete annotations corresponding to the reading
# - update clone nodes according to deleted primer direction
#
sub ContigStructReaddel {
  my ($pRead, %opt) = @_;
  my ($pCtgStruct, $debug);
  my ($pField, $pCloneIdx);

  # function parameters
  $debug = $opt{-debug};
  $pCtgStruct = $$pRead{contig}{root};

  # delete entry in read indices: contig, contig end, root
  delete $$pRead{contig}{read}{$$pRead{id}};
  if (exists $$pRead{cend}) {
    @{$$pRead{cend}{read}} = grep { $_ ne $pRead } @{$$pRead{cend}{read}};
  }
  delete $$pCtgStruct{read}{$$pRead{id}};

  # delete entry in clone indices: contig, root
  $pField = &ReadidToFields ($$pRead{id});
  $debug and printf STDERR "%s. read ID fields: clone %s, "
    ."primer %s, primer direction %d, number %d, read suffix %s\n", &MySub,
    @{$pField}{'cln','prm','prd','num','re'};
  foreach $pCloneIdx ($$pRead{contig}{clone}, $$pCtgStruct{clone}) {
    $debug and printf STDERR "%s. deleting read type entry in clone index\n", &MySub;
    delete $$pCloneIdx{$$pField{cln}}{$$pField{prd}}{$$pField{re}};
    unless (%{ $$pCloneIdx{$$pField{cln}}{$$pField{prd}} }) {
      $debug and printf STDERR "%s. deleting clone end entry in clone index\n", &MySub;
      delete $$pCloneIdx{$$pField{cln}}{$$pField{prd}};
      unless (@{ &DataTreeSlc($$pCloneIdx{$$pField{cln}},[['^-?[01]$','regexp'],[0,'all'],['orient','regexp']]) || [] }) {
        delete $$pCloneIdx{$$pField{cln}};
      }
    }
  }

  # destroy read sub-structure
  %$pRead = ();
}


1;
# $Id: Assembly.pm,v 1.21 2018/06/05 18:02:56 szafrans Exp $
