#! /usr/local/bin/perl
################################################################################
#
#  GAP4 Database Handling in the GSCJ Genome Environment
#
#  copyright (c)
#    Fritz Lipmann Institute Jena, Genome Analysis Group, 2006, 2013
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 2000-2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - See function &usage for description of command line syntax
#
# - each function comes along with a description at the beginning of the code
#   block
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   %GlobStore
#   $ProgFile,$ProgFstump
#   %ProgParam
#   $ProgMode,%ProgOpt,@ProgArg
#
# - usage help, command line arguments
#   &usage
#   &AddSwitch
#
# - analyse/manipulate GAP4 assembly
#   &ProgExpTranspos
#   &ProgScfNice
#   &ProgScfCure
#   &ScfCureInfo
#
# - perform assembly
#   &ProgSeqprimEnter
#   &ProgAssembCplx
#   &ProgAssembSingles
#
# - annotation concepts
#   &ProgListOligo
#   &ProgListGene
#   &ProgAnnotGlimmerM
#   &ProgAnnotReduce
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - some program modes should be turned into stand-alone scripts
#   -ExpTranspos
#
# - look also for notes in the header of each function block
#
################################################################################


# global constants and variables
# also used by modules as a unique global data anchor
our %GlobStore;

# include path(s), includes
use strict; #use warnings;  # OK 20040813
use Cwd qw(realpath);
BEGIN {
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
  my ($mypath) = realpath(__FILE__)=~m/(^.*)\//;
  push @INC, $mypath;
}
use MainLib::StrRegexp;
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs);
use MainLib::File;
use MainLib::FileTmp;
use MainLib::FileAccAsync;
use MainLib::Misc;
use Math::kCalc;
use database::DbPlain;
use SeqLab::SeqBench;
use SeqLab::SeqFormat;
use SeqLab::SeqStreamIn;
use SeqLab::SeqStreamOut;
use SeqLab::MotifRE;
use SeqAlign::Gap;
use SeqAlign::Assembly;
use ReadWatch::Read;
use ReadWatch::ReadIndex;
use ReadWatch::Library;
use ReadWatch::Cluster;


# script ID
# - program name as specified on the command line
our $ProgFile = ( split('/',__FILE__) )[-1];
our $ProgFstump=$ProgFile; $ProgFstump=~s/\.\w{1,4}$//;

# global constants (esp. default values)
our %ProgParam;
$ProgParam{path}{GapTranspos} = 'transpos.0';
$ProgParam{call}{mask} = {
  rept => $CorePath{call}{PerlScript} .'/'. 'Blast.pl -MaskSeq -program=blastn2 -db=dicty.mask -param=DictyMask  -ThreshID=0.885 -MaskPoly=10',
  trna => $CorePath{call}{PerlScript} .'/'. 'Blast.pl -MaskSeq -program=blastn2 -db=dicty.trna -param=DictyMask  -ThreshID=0.900',
  };
$ProgParam{default}{ProgMode} = '';

# working desk
$ProgParam{store} = undef;

# ensure GAP4 Tcl package available
unless (-d $CorePath{call}{GapScript}) {
  die sprintf "ERROR: GAP4 script path %s\n", $CorePath{call}{GapScript} ? 'not found':'undefined';
}


# manage I/O #####################################

# organiz I/O handles
&Unbuffer();

# organiz tmp files
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $ProgParam{TmpManag} = MainLib::FileTmp->new(); };
&$pcFT;


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
our $ProgMode = undef;
our @ProgArg = ();
our %ProgOpt = ();
unshift @ProgArg, &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};

# eventually open LOG file
if ($ProgOpt{-log}) {
  $ProgOpt{LogFile} = ($ProgOpt{-log} ne '1')? $ProgOpt{-log} : undef;
  &LogOpen (-file=>$ProgOpt{LogFile}, -stamp=>$ProgFstump, -prog=>"$ProgFile -$ProgMode");
}
END {
  $ProgOpt{-log} and &LogClose();
}


# work flow manifold #############################

# chain to program mode which is optionally argument-less
if (0) { }
elsif (!@ARGV or $ProgMode=~m/^h(elp)?$/i) { &usage() }
elsif ($ProgMode =~ m/^(Annot|Tag)Reduce$/i) {
  &ProgAnnotReduce (@ProgArg);
  exit 0;
}
elsif ($ProgMode =~ m/^ScfCure$/i) {
  &ProgScfCure (@ProgArg);
  exit 0;
}
elsif ($ProgMode =~ m/^ScfNice$/i) {
  &ProgScfNice (@ProgArg);
  exit 0;
}

# ensure input argument(s)
unless (@ProgArg) {
  print STDERR "ERROR: input arguments missing\n";
  exit 1;
}

# chain to program mode (with input argument(s))
if (0) { }
elsif ($ProgMode =~ m/^AnnotGlimmerM$/i) {
  &ProgAnnotGlimmerM (@ProgArg);
}
elsif ($ProgMode =~ m/^AssembCplx$/i) {
  &ProgAssembCplx ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^AssembSingles$/i) {
  &ProgAssembSingles (@ProgArg);
}
elsif ($ProgMode =~ m/^ExpTranspos$/i) {
  &ProgExpTranspos ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^ListGene$/i) {
  &ProgListGene (@ProgArg);
}
elsif ($ProgMode =~ m/^ListOligos?$/i) {
  &ProgListOligo (@ProgArg);
}
elsif ($ProgMode =~ m/^SeqPrimerEnter$/i) {
  &ProgSeqprimEnter (@ProgArg);
}
else {
  print STDERR "ERROR: unknown program mode or switch '$ProgMode'\n";
  exit 1;
}

# exit script successfully
# cf. END blocks!
exit 0;


################################################################################
# usage help, command line arguments
################################################################################


sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile is desired for handling sequencing result folders and files,
 especially in the Dictyostelium genome project.

COMMAND LINE SYNTAX
 $ProgFile  -<ModeSwitch> [-<OptionalSwitch> ...] <Arg1> [<Arg2> ...]

Arguments
---------
 See function descriptions.

path arguments:
 Path "-" resolves to STDIN or STDOUT, depending on the context.

ModeSwitch (case-insensitive)
-----------------------------
<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-AnnotGlimmerM    introduce gene tagging for GlimmerM predictions in a
                  GAP4 database.
                  *** outdated ***: But, this may be turned into tagging of 
                  geneid gene models.
                  NOTE: in current version, Experiment format tags are written
                  to STDOUT.
                  Arg1        GAP4 database: (folder/)database.version
                  Arg2*       optional: contig specifier(s)
                  -MaskTag
-AnnotReduce      for a batch of Experiment files: reduce redundant tags.
                  Arg1*       list of Experiment files, default: files listed
                              in ./fofn & 'ContigTags'
-AssembCplx       perform cyclic procedure of "Gap.pl -AddRev ..." and
                  "GapAddBlast.sh ..."
                  *** out of curation ***
                  Arg1        GAP4 database: (folder/)database.version
-AssembSingles    perform assembly of singlet reads via cyclic procedure of 
                  "Gap.pl -AddRev ..." and "GapAddBlast.sh ...".
                  *** out of curation ***
                  Arg1        target GAP4 database: (folder/)database.version
                  Arg2*       limit number of reads
-ExpTranspos      export transposon project contig to analysis/presentation
                  sub-directory. Perform standard analyses.
                  Arg1        transposon ID
-h(elp)           output command line syntax description and exit
-ListGene         report tagged genes in a GAP4 database
                  Arg1        GAP4 database: (folder/)database.version
                  Arg2*       optional: contig specifier(s)
-ListOligo        list oligo information from annotations
                  Arg1        contig data source
                  Arg2*       optional: contig specifier(s) for selection
-ScfCure          restore broken SCF links in a GAP4 project folder, applied to
                  all *.0 databases in that folder. Before running the program,
                  change into the GAP4 project folder and close all GAP4
                  processes for databases in that project folder.
                  Args        NONE
-ScfNice          turn SCF links in a GAP4 project folder to nice syntax
                  Arg1*       GAP4 project folder
-SeqPrimerEnter   provide manual assembly recipee for primer-derived sequences.
                  Outputs to STDOUT, starts editor (for recipee) and GAP4 (for
                  manual assembly) processes.
                  Arg1        path of report produced by
                              $ProgFile -ListOligo
                  Arg2+       GAP4 database(s): (folder/)database.version
-TagReduce        synonym of -AnnotReduce

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-debug(=N)        print debug protocol to STDERR (sometimes STDOUT). Keep
                  temporary files.
                  N           debug depth value
-fofn=S           supply list of command arguments in a file. The entries of the
                  file will be appended to the argument list. Multiple -fofn
                  switch statements are allowed.
-log(=S)          redirect STDOUT and STDERR to LOG file
                  S           log file path, default path worked out
                              automatically from built-in directives.
-MaskTag=S        replace tagged sequence ranges by Unknowns. Switch argument
                  is a comma-delimited list of the tag labels that are meant to
                  be masked.
-OutStump=S       path stump for multi-file output. A default is derived from
                  input file names.
-RcReadIndex=S    use read index rc file(s) (comma-delimited switch arguments),
                  default: $ReadWatch::ReadIndex::LibGlob{default}{index}
-SlcOligo=S       specify a RegExp which shall be used for oligo ID selection
-timer            print time-performance protocol to STDERR
-v(erbose)        print extended action protocol to STDOUT / STDERR

Environment Variables
---------------------
 \$PERLPATH        primary search path for Perl package look-up
 \$TEMPPATH        directory for storage of temporary files, default /tmp

Temporary Files
---------------
 The program may generate temporary files. These will be placed either in a
 directory specified by \$ENV{TEMPPATH} or in /tmp.
END_USAGE
  print "\n";
  exit 0;
}


# add program switches to global table (hash)
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global data:
#   $ProgMode
#   @ProgArg     may be pre-filled here beside it's typically used to store
#                return value from &GetoptsNArgs via &AddSwitch (option
#                -fofn)
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &MainLib::Misc::GetoptsNArgs
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can reported in accordance to
#   the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my $switch = shift;
  my ($SwitchArg, $pTab);

  # optional switches
  if ($switch =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($switch =~ m/^fofn=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    if ($pTab = &LoadFoid($SwitchArg)) {
      push @ProgArg, @$pTab;
    } else {
      printf STDERR "ERROR: unable to read entries from file of filenames %s (-> %s)\n",
        $1, $SwitchArg;
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^log(=(.*))?$/i) {
    $ProgOpt{-log} = $2 ? &PathExpand($2) : 1;
    return;
  }
  if ($switch =~ m/^MaskTag=([\w,]+)$/i) {
    $ProgOpt{-MaskTag} = [ split(/,/,$1) ];
    return;
  }
  if ($switch =~ m/^RcReadIndex=(.+)$/i) {
    $ProgOpt{-RcReadIndex} = [ split(',',$1) ];
    return;
  }
  if ($switch =~ m/^Select/i) {
    print  STDERR "ERROR: selector switches are now spelled \"-Slc*\"\n";
    exit 1;
  }
  if ($switch =~ m/^SlcOligo=(.+)$/i) {
    $ProgOpt{-SlcOligo} = $1;
    return;
  }
  if ($switch =~ m/^timer$/i) {
    $ProgOpt{-timer} = 1;
    $ProgParam{time}{start} = time();
    return;
  }
  if ($switch =~ m/^v(erbose)?$/i) {
    $ProgOpt{-verbose} = 1;
    return;
  }

  # program mode switches
  if (defined $ProgMode) {
    printf STDERR "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($switch||"''");
    exit 1;
  }
  else {
    $ProgMode = $switch;
    return;
  }
}


################################################################################
# analyse/manipulate GAP4 assembly
################################################################################


# export GAP4 contig to AlnK project, perform standard analyses
#
# INTERFACE
# - argument 1: source GAP4 database: (folder/)database.version
# - argument 2: selector for read IDs (regexp)
#               A name of the analysis subfolder and certain files will be
#               derived from this selector statement
#
# - global options:
#   -debug      print debug protocol to STDOUT
#   -OutStump   [STD]
#


# export GAP4 contig to AlnK project, perform standard analyses
#
# INTERFACE
# - argument 1: transposon ID
#               This will be the name of the analysis subfolder in the
#               GAP4 database folder
#
# - global options:
#   -debug      print debug protocol to STDOUT
#   -verbose    print verbose process protocol to STDOUT
#
# DEBUG, CHANGES, ADDITIONS
# - see 'DEBUG 011219'
# - make parts autonomous in respect of cwd and path of target files
#
sub ProgExpTranspos {
  require IPC::Open2; IPC::Open2->import('&open2');
  my (${NameCtg}) = @_;
  my ($debug, $dbg2, $verbose, $bTimer, $time);
  my (%path, %stat, $StatVar, $pTabEnds);
  my (%call, $CallPID, $buffer);

  # function constants
  my $ThreshBlastEnd = 45;
  my %FragCalc = (
    l_O_core => 45,
    l_O_div  => 10,
    l_F      =>  0,
    );

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $verbose   = $ProgOpt{-verbose};
  $debug||$verbose and printf "%s. analysing transposon cluster %s\n", &MySub, ${NameCtg}||"''";
  $bTimer = $ProgOpt{-timer};

  # working paths
  umask (umask() | 0002);
  $path{GapDir}  = $CorePath{ReadWatch}{GapTranspos};
  $path{GapProj} = $ProgParam{path}{GapTranspos};
  $path{ExpAnalDir} = "$path{GapDir}/${NameCtg}";
  $path{AnalEnds} = "$path{GapDir}/${NameCtg}/ends";

  ##############################################################################
  # physical work from GAP4 project to AlnK project

  chdir $CorePath{ReadWatch}{GapTranspos};

  # confirm/create analysis directory
  unless (-d $path{ExpAnalDir}) {
    if (-e $path{ExpAnalDir}) {
      printf STDERR "WARNING: file %s exists - will be overwritten by newly created directory\n", $path{ExpAnalDir}||"''";
      unlink $path{ExpAnalDir};
    }
    printf "creating analysis directory %s\n", $path{ExpAnalDir}||"''";
    unless (mkdir ($path{ExpAnalDir})) {
      printf STDERR "ERROR: unable to create directory %s\n", $path{ExpAnalDir}||"''";
      exit 1;
    }
  }

  # force contig to plus strand
  print  "force contig to plus strand\n";
  $call{cplus} = &CallExtClosure ($CorePath{call}{GapContigPlus}, $path{GapProj}, ${NameCtg});
  unless (&PhysicalAccessCtrl ($path{GapProj}, -mode=>'func', -noLock=>1,
    -func=>$call{cplus}, -log=>1)) {
    printf STDERR "ERROR: unable to modify database %s in %s\n", $path{GapProj}, &PathCwd();
    printf STDERR "  call: %s\n", &{$call{cplus}} ('WhatRUCalling');
    exit 1;
  }
  system (sprintf ("%s %s %s", $CorePath{call}{GapContigReads}, $path{GapProj}, 'fofn'));

  # export target contig from gap to contig folder
  print  "export Experiment files for reads residing in the contig\n";
  $call{export} = &CallExtClosure ($CorePath{call}{GapExportDirect}, $path{GapProj},
    $path{ExpAnalDir}, ${NameCtg}, "> /dev/null");
  unless (&{$call{export}}) {
    print STDERR "ERROR: export of directed assembly from GAP4 database failed\n";
    exit 1;
  }

  # create AlnK project, extract consensus
  print  "create AlnK project, extract consensus\n";
  chdir $path{ExpAnalDir};
  $call{ParamChg} = (${NameCtg} eq 'Tdd-5') ? "param Report::SymbolAnalysis::MaskGapInside  260\n" : '';
  $call{create} = <<END_CALLRPTCONS;
name ${NameCtg}
del ${NameCtg}
$call{ParamChg}save ${NameCtg}.prj
print consensus
dismiss
END_CALLRPTCONS
  unless (&open2 (\*INPROC, \*OUTPROC, "$CorePath{call}{AlnK} fofn")) {
    print STDERR "ERROR: unable to start AlnK process (1)\n";
    exit 1;
  }
  print OUTPROC $call{create};
  &FileCopy (\*INPROC, "${NameCtg}.fa");
  close INPROC;
  close OUTPROC;

  ##############################################################################
  # handle sequence stretch ends

  # sample and report divergent sequence stretch ends
  # do correct remap of original sequences by reload in Align.pl
  print "export divergent ends\n";
  chdir $path{ExpAnalDir};
  $call{ends} = <<END_CALLRPTENDS;
report ends
exit
END_CALLRPTENDS
  unless (open (\*OUTPROC, "| $CorePath{call}{AlnK} ${NameCtg}.prj > ${NameCtg}_ends.log 2>&1")) {
    print  STDERR "ERROR: unable to start AlnK process (2)\n";
    exit 1;
  }
  print  OUTPROC $call{ends};
  close  OUTPROC;

  # analyse divergent sequence ends in sub-directory
  print "analyse divergent sequence ends\n";
  unless (-d $path{AnalEnds}) {
    mkdir ($path{AnalEnds});
  } else {
    unlink glob ("$path{AnalEnds}/*");
  }
  CallOnEnds: {
    foreach ("$CorePath{call}{SeqCat} ${NameCtg}_ends.fa -SlcLen=$ThreshBlastEnd -OutSeq=$path{AnalEnds}/${NameCtg}_ends.fa",
             "$CorePath{call}{BlastWrap} -ListMatch -program=blastn2 -db=dicty.mask -param=SimCluster -MaskPoly=11 -ThreshId=0.890 $path{AnalEnds}/${NameCtg}_ends.fa -OutTab=- > $path{AnalEnds}/${NameCtg}_ends_rept.bll \&",
             "$ProgParam{call}{mask}{rept} $path{AnalEnds}/${NameCtg}_ends.fa > $path{AnalEnds}/${NameCtg}_ends_masked.fa",
#             "$CorePath{call}{BlastWrap} -plain -program=blastn2 -db=dicty.all -param=SimCluster -ValS=280 -OutSuffix=read.bl $path{AnalEnds}/${NameCtg}_ends_masked.fa \&",
             ) {
      $call{onEnds} = &CallExtClosure ($_);
      unless (&{$call{onEnds}}) {
        printf STDERR "ERROR in divergent sequence end procedure. Call was:\n  %s\n",
          &{$call{onEnds}} ('WhatRUCalling');
        last CallOnEnds;
      }
    }

    # do extra analysis for tRNA-associated transposons
    if (${NameCtg} =~ m/^(TRE|DGLT-A)/i) {
      system ("$CorePath{call}{BlastWrap} -ListMatch -db=dicty.trna -param=SimCluster -ThreshId=0.920 $path{AnalEnds}/${NameCtg}_ends.fa -OutTab=- > $path{AnalEnds}/${NameCtg}_ends_tRNA.bll");
      foreach ('A-box1', 'A-box2', 'B-box') {
        my $call = "$CorePath{call}{SeqCat} $path{AnalEnds}/${NameCtg}_ends.fa -OutSeqFmt=table"
          . " | $CorePath{call}{SeqMotif}=tRNA_$_ - "
          . " > $path{AnalEnds}/${NameCtg}_ends_$_.tab";
        system ($call);
        $debug and print  STDERR "calling: $call\n";
      }
      unless (open (\*INPROC, "$CorePath{call}{tRNA} ${NameCtg}_ends.fa |")) {
        print  STDERR "ERROR: unable to start tRNA scan process\n";
        last CallOnEnds;
      }
      &FileCopy (\*INPROC, "$path{AnalEnds}/${NameCtg}_ends_tRNA.txt");
      close INPROC;

      # summarise analysis
      system ("$CorePath{call}{Convert} $path{AnalEnds}/${NameCtg}_ends_A-box1.tab -search=' +' -chg='\\t' | $CorePath{call}{Convert} - -search='(\\d+\\t)(\\d+)' -chg='\${1}A-box1\\t\$2' > $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
      system ("$CorePath{call}{Convert} $path{AnalEnds}/${NameCtg}_ends_A-box2.tab -search=' +' -chg='\\t' | $CorePath{call}{Convert} - -search='(\\d+\\t)(\\d+)' -chg='\${1}A-box2\\t\$2' >> $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
      system ("$CorePath{call}{Convert} $path{AnalEnds}/${NameCtg}_ends_B-box.tab -search=' +' -chg='\\t' | $CorePath{call}{Convert} - -search='(\\d+\\t)(\\d+)' -chg='\${1}B-box\\t\$2' >> $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
      system ("$CorePath{call}{Convert} $path{AnalEnds}/${NameCtg}_ends_tRNA.bll -ColOrder=queryseq_id,queryseq_len | $CorePath{call}{Convert} - -search='^(\\S+(\\t\\S+){1}).*\$' -chg='\$1' >> $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
      system ("$CorePath{call}{Convert} $path{AnalEnds}/${NameCtg}_ends_tRNA.bll -ColOrder=queryseq_id,queryseq_range,matchseq_id,highest_rel_identity | $CorePath{call}{Convert} - -search='^(\\S+(\\t\\S+){3}).*\$' -chg='\$1' >> $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
      system ("$CorePath{call}{Convert} $path{ExpAnalDir}/${NameCtg}_ends.tab -ColOrder='read(s)' | $CorePath{call}{Convert} - -search='\\t.+\$' -chg= | $CorePath{call}{Convert} - -search='^(\\S+) (.+)\$' -chg='\$1\\t\$2' >> $path{AnalEnds}/${NameCtg}_ends_X-box.tab");
    }
  }
  
  exit 0;

  ##############################################################################
  # coverage, polymorphisms & divergence, copy number statistics

  # coverage and polymorphisms
  print "analyse coverage and polymorphisms\n";
  $call{cover} = <<END_CALLRPTDIVERG;
report cover
resolveUnk 0.00
report diverg
dismiss
END_CALLRPTDIVERG
  unless ($CallPID = &open2 (\*INPROC, \*OUTPROC,
    "$CorePath{call}{AlnK} -noDB ${NameCtg}.prj")) {
    print STDERR "ERROR: unable to start AlnK process (3)\n";
    exit 1;
  }
  if (open (OUTREPORT, ">>${NameCtg}_uncertain.tab")) {
    print OUTPROC $call{cover};
    print OUTREPORT "\n";
    &FileCopy (\*INPROC, \*OUTREPORT);
    close INPROC;
    close OUTPROC;
    close OUTREPORT;
  } else {
    print STDERR "ERROR: unable to write to file ${NameCtg}_uncertain.tab\n";
    kill $CallPID;
    exit 1;
  }

  # copy number and fragmentation statistics
  # DESCRIPTION 000702
  # - Grundgesamtheit: zu erwartende Summe an Transposon-Nukleotiden fuer:
  #   genome/JAX4, Chr1/JC1, Chr2/JC2, Chr3/JC3
  # - Die Koordination zwischen AlnK- und GAP4-Projekt erfolgt ueber die
  #   Ressourceneinstellung in &ReadWatch::Cluster::ClusterStat.
  #   genome/JAX4, Chr1/JC1, Chr2/JC2, Chr3/JC3
  # - Stichprobenumfang im GAP4-Projekt (Teilmenge des verfuegbaren Read-Pools):
  #   Schaetzen aus getroffenen Reads. Hochrechnen mittels Trefferquote im
  #   Clustering. Doubletten werden mehrfach gezaehlt - wie auch bei
  #   Cluster-Statistiken. Wie soll man auch Doubletten identifizieren?
  #   Allerdings: Wie ist es mit den Contig-Sequenzen, die jetzt schon z.B.
  #   in DIRS-1 eingearbeitet wurden?
  $path{report} = "${NameCtg}_FragmentStat.log";
  printf "analyse copy number and fragmentation, file %s\n", $path{report};
  open  (OUTREPORT, ">${NameCtg}_FragmentStat.log");
  print  OUTREPORT "statistics for GAP4 contig ${NameCtg}\n";
  &DataPrint ($stat{gap} = &ClusterStat("${NameCtg}.prj",-debug=>$dbg2),
    -handle=>\*OUTREPORT);
    # "${NameCtg}.prj" should represent AlnK file in cwd
  print  OUTREPORT ('-' x 80), "\nstatistics for cluster ${NameCtg}\n";
  # DEBUG 011219: What will happen if the cluster project does not exist?
  # - is &ClusterStat doing an exit?
  # - try contig b36h10
  &DataPrint ($stat{cluster} = &ClusterStat("${NameCtg}",-debug=>$dbg2),
    -handle=>\*OUTREPORT);
  $call{ReadStat} = "$CorePath{call}{SeqStat} '-SlcID=^JAX4.+\\..1\$' -ClipQual -ClipUnk -fofn=fofn";
  $buffer = &ReadFile ("$call{ReadStat} |");
  $buffer =~ m/average sequence length: ([0-9,]+)/;
  $stat{read}{MeanLen} = $1;
  $stat{read}{MeanLen} =~ s/,//g;
  print  OUTREPORT ('-' x 80), "\nstatistics for reads in GAP4 contig ${NameCtg}\n$buffer";
  # DESCRIPTION 000811
  # maximum likelihood esitmation as described in Gloeckner et al. 2000
  # p(H) = (l_R - l_O + max (l_F - l_O, 0)) / l_G
  # realise that there're more than one occurrences (n_F) of the feature:
  # E(n_H) = n_F * (l_R - l_O + max (l_F - l_O, 0)) / l_G
  # ML estimate for n_F
  # ML(n_F) = (n_H * l_G) / n_R / (l_R - l_O + max (l_F - l_O, 0))
  $stat{CalcFrag}{l_G} = &TgtspecifTgtNum ('genome', 'Nt');
  $stat{CalcFrag}{n_H} = &Sum (map { &MatchCt (\($_->{'read(s)'}), 'JAX4\w+\..1\b') }
    @{ $pTabEnds = &PlainToTable ("${NameCtg}_ends.tab", -TabType=>'AH', -comments=>1) });  #"
  $stat{CalcFrag}{n_R} = $stat{cluster}{JAX4}{LibSeq} * $stat{gap}{JAX4}{AlnSeq} / $stat{cluster}{JAX4}{AlnSeq};
  $stat{CalcFrag}{l_F} = $FragCalc{l_F};
  $stat{CalcFrag}{l_R} = $stat{read}{MeanLen};
  $stat{CalcFrag}{l_O} = $FragCalc{l_O_core} + $FragCalc{l_O_div};
  unless ($stat{CalcFrag}{n_R}) {
    print  STDERR "ERROR: unable to get sequence number from alignment statistics\n";
    exit 1;
  }
  $stat{CalcFrag}{formulaval} = $stat{CalcFrag}{formula} =
    'n_F,ML = (n_H * l_G) / n_R / (l_R - l_O + max (l_F - l_O, 0))';
  $stat{CalcFrag}{'n_F,ML'} =
     $stat{CalcFrag}{n_H} * $stat{CalcFrag}{l_G} /
     $stat{CalcFrag}{n_R} /
    ($stat{CalcFrag}{l_R} - $stat{CalcFrag}{l_O}
      + &Max ($stat{CalcFrag}{l_F} - $stat{CalcFrag}{l_O}, 0));
  foreach $StatVar (qw(l_G  n_H  n_R  l_F  l_R  l_O  n_F,ML)) {
    $stat{CalcFrag}{formulaval} =~ s/$StatVar/$stat{CalcFrag}{$StatVar}/eg;
  }
  print  OUTREPORT ('-' x 80), "\n", <<END_PLAINREPORT;
ML estimate for genome-wide border instances (n_F)
$stat{CalcFrag}{formula}
$stat{CalcFrag}{formulaval}
END_PLAINREPORT
  printf OUTREPORT "number of fragments (n_F / 2): %.3f\n",
    $stat{CalcFrag}{'n_F,ML'} / 2;
  printf OUTREPORT "degree of fragmentation: %.3f\n",
    $stat{CalcFrag}{'n_F,ML'} / 2 / $stat{cluster}{JAX4}{GenomeCopyNt};
  # another approach:
  # estimate based on border-instances found in the body of the element
  $stat{CalcFrag}{n_H} = &Sum (map { &MatchCt (\($_->{'read(s)'}), 'JAX4\w+\..1\b') }
    grep { $_->{DivPos} > 10 and $_->{DivPos} < $stat{gap}{sum}{AlnConsLen}-9 }
    @{$pTabEnds});
  $stat{CalcFrag}{formulaval} = $stat{CalcFrag}{formula};
  $stat{CalcFrag}{'n_F,ML'} =
     $stat{CalcFrag}{l_G} * $stat{CalcFrag}{n_H} /
     $stat{CalcFrag}{n_R} /
    ($stat{CalcFrag}{l_R} - $stat{CalcFrag}{l_O}
      + &Max ($stat{CalcFrag}{l_F} - $stat{CalcFrag}{l_O}, 0));
  foreach $StatVar (qw(l_G  n_H  n_R  l_F  l_R  l_O  n_F,ML)) {
    $stat{CalcFrag}{formulaval} =~ s/$StatVar/$stat{CalcFrag}{$StatVar}/eg;
  }
  print  OUTREPORT ('-' x 80), "\n", <<END_PLAINREPORT;
ML estimate for genome-wide border instances found in the body of the element (n_F)
$stat{CalcFrag}{formula}
$stat{CalcFrag}{formulaval}
END_PLAINREPORT
  printf OUTREPORT "degree of fragmentation: %.3f\n",
    1 + ($stat{CalcFrag}{'n_F,ML'} / 2 / $stat{cluster}{JAX4}{GenomeCopyNt});
  close OUTREPORT;

  ##############################################################################
  # ORF analysis

  # ORF analysis
  print "analyse ORFs\n";
  $call{OrfBase} = join (' ', $CorePath{call}{SeqOrf}, '-OutImg', "${NameCtg}.fa", "-SlcLen=45");
  $call{orf} = &CallExtClosure ($call{OrfBase},
    "-OutTab=${NameCtg}_orf6.tab", "-OutImg=${NameCtg}_orf6.png");
  unless (&{$call{orf}}) {
    print STDERR "ERROR: unable to start SeqHandle process (ORF 6f). Call was:\n%s\n",
      &{$call{orf}} ('WhatRUCalling');
    exit 1;
  }
  $call{orf} = &CallExtClosure ($call{OrfBase}, "-SlcFrame=plus",
    "-OutTab=/dev/null", "-OutImg=${NameCtg}_orf3.png");
  unless (&{$call{orf}}) {
    print STDERR "ERROR: unable to start SeqHandle process (ORF 3f). Call was:\n%s\n",
      &{$call{orf}} ('WhatRUCalling');
    exit 1;
  }

  # the end
  print "done\n";
}


# change symbolic links in GAP4 project folder
#
# INTERFACE
# - argument 1*: GAP4 project folder
#
sub ProgScfNice {
  my ($GapDir) = @_;
  my ($PathEntry, $target, $targetnew);
  my ($TgtExper);

  # function parameters
  $GapDir ||= './';
  $GapDir = &PathExpand ($GapDir);
  printf "%s. working on project folder %s\n", &MySub, $GapDir||"''";

  # loop over link entries
  foreach (&ReadDir ($GapDir, -select=>'SCF$')) {
    $PathEntry = "$GapDir/$_";
    unless (-l $PathEntry) {
      printf STDERR "%s. WARNING: entry $_ is not a link\n", &MySub;
      next;
    };
    $targetnew = $target = readlink ($PathEntry);

    # change link syntax
    $targetnew =~ s#^/gen/(emu|vulpix)/pro/Dictyostelium/#/gen/fox/pro/Dictyostelium/#;
    $targetnew =~ s#^../../#/gen/fox/pro/Dictyostelium/#;
    $targetnew =~ s#//+#/#;
    $targetnew =~ s#fox/pro/(AX4|Chr|II[AC]|PCR|sdi|transpos)#fox/pro/Dictyostelium/$1#;
    if ($target ne $targetnew) {
      print "$target => $targetnew\n";
      unlink $PathEntry;
      symlink ($targetnew, $PathEntry);
    }
  }
}


# restore broken SCF links in a GAP4 project folder
#
# INTERFACE
# - global options:
#   -debug       [STD]
#   -timer       [STD]
#   -OutStump    ...
#
# DESCRIPTION
# - read data structure, hash:
#   xxx        hash of Libtgt-sorted hints from broken SCF links
#
# - hint data structure, hash:
#   ByLibtgt   hash of Libtgt-sorted hints from broken SCF links
#   ByRecov    hints added according to successful recovery of SCFs
#   all        all current hints on:
#              grp   group ID
#              prj   project ID
#   queue      array of hints (= full search paths) used for recovery
#   QueueInit  init entry hint queue
#   QueueOld   array of hints (= full search paths) used for last recovery
#
sub ProgScfCure {
  my ($bMe, $debug, $dbg2, $bTimer, $time);
  my (%prj, @Gapdb, $ItGapdb, $bRenScf);
  my (@scf, $CtMiss, $ItScfMiss, $pScfMiss, %ScfMiss);
  my (%hint, $ItLibtgt, $ItGrp, $ItPrj, $ItMch);
  my ($ItHint, $bAddHint, $CtRecov);

  # function constants: regexp definitions
  my $GetGrpPrj = '\bpro\/([\w.-]+)\/([\w.-]+)';
  my $GetMch = '\b([0-9][0-9].[0-9][0-9].[0-9][0-9]_[^\/]*)';

  # function parameters
  $bMe = ((getlogin()||getpwuid($<)) eq 'szafrans') ? 1 : 0;
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};

  # project path and according informative fields
  $prj{path} = &PathCwd();
  if ($prj{path} =~ m/$GetGrpPrj/o) {
    $prj{grp} = $1;
    $prj{prj} = $2;
  }
  if ($debug) {
    printf STDERR "%s. project characteristics:\n", &MySub;
    printf STDERR "  physical path: %s\n", $prj{path};
    printf STDERR "  group: %s\n", $prj{grp};
    printf STDERR "  project: %s\n", $prj{prj};
  }
  @Gapdb = grep { -e "$_.aux" } glob ('*.0');
  unless (int @Gapdb) {
    die sprintf "ERROR: did not find any GAP4 database in current directory\n";
  }
  foreach $ItGapdb (grep { -e "$_.BUSY" } @Gapdb) {
    die sprintf "ERROR: GAP4 database %s is busy. Please, close it and start me again.\n", $ItGapdb;
  }

  ##############################################################################
  # physical GAP4 DB restoration

  # loop over GAP4 DBs
  printf "\nperforming physical restoration of GAP4 DBs\n";
  foreach $ItGapdb (@Gapdb) {
    my $buffer = &ReadFile ("$CorePath{call}{GapScript}/readtracecure.tcl $ItGapdb &|");
    printf "%s - %s", $ItGapdb, $buffer;
    $bRenScf ||= ($buffer=~m/(\d+) +change/) ? $1 : '';
  }

  # have to rename *.scf.gz files
  if ($bRenScf) {
    printf "\nrenaming *.scf.gz files\n";
    $CtMiss = 0;
    foreach $ItScfMiss (glob ('*.scf.gz')) {
      $CtMiss ++;
      my $buffer = $ItScfMiss;
      $buffer =~ s/\.scf\.gz$/SCF/;
      &mv ($ItScfMiss, $buffer);
    }
    printf "%d file%s/link%s renamed", $CtMiss, (($CtMiss==1) ? '':'s') x 2;
  }

  ##############################################################################
  # sample lost links

  # loop over GAP4 DBs
  $bTimer and $time = &Sum ((times)[0,2]);
  printf "\ninspecting GAP4 DBs for required SCFs\n";
  $CtMiss = 0;
  foreach $ItGapdb (@Gapdb) {
    @scf = @{ &LoadFoid ("$CorePath{call}{GapScript}/listscfs.tcl $ItGapdb |", -debug=>$dbg2) };
    printf "%s - %d SCF pointer%s\n", $ItGapdb, int(@scf), (@scf==1)?'':'s';

    # loop over lost SCF links
    foreach $ItScfMiss (grep { ! -e $_ } @scf) {

      # enter broken link into list
      $pScfMiss = &ScfCureInfo ($ItScfMiss);
      $ScfMiss{$$pScfMiss{NameRead}} = $pScfMiss;
      $CtMiss ++;

      # no hints, but add sequencing target into hint data structure
      #  (later as an iterator for restoration_
      $hint{ByLibtgt}{$$pScfMiss{NameField}{tgt}} ||= {};
    }
  }
  $bTimer and printf STDERR "%s. CPU time to check SCF existence\n", &MySub, &Sum((times)[0,2])-$time;
  printf "total of %d lost SCF%s\n", $CtMiss, ($CtMiss==1) ? '':'s';

  ##############################################################################
  # sample broken links

  # read SCF links
  $bTimer and $time = &Sum ((times)[0,2]);
  @scf = &ReadDir ('./', -select=>'SCF$');
  if ($bTimer) {
    printf STDERR "%s. CPU time for directory listing %.3f\n", &MySub, &Sum((times)[0,2])-$time;
    $time = &Sum ((times)[0,2]);
  }
  printf "\nchecking existing SCF links (total: %d)\n", int @scf;

  # loop over invalid SCF links
  $CtMiss = 0;
  foreach $ItScfMiss (@scf) {
    if (-e $ItScfMiss) { next }

    # enter broken link into list
    $pScfMiss = &ScfCureInfo ($ItScfMiss);
    $ScfMiss{$$pScfMiss{NameRead}} = $pScfMiss;
    $CtMiss ++;

    # determine group/project for existing link
    if ($$pScfMiss{ScfLink}{TgtOldFile} = readlink $$pScfMiss{NameScf}) {
      if ($$pScfMiss{ScfLink}{TgtOldFile} =~ m/$GetGrpPrj/o) {
        $$pScfMiss{ScfLink}{TgtOldGrp} = $1;
        $$pScfMiss{ScfLink}{TgtOldPrj} = $2;
      } elsif ($debug) {
        printf STDERR "%s. regexp failure for old link target: %s\n", &MySub,
          $$pScfMiss{ScfLink}{TgtOldFile};
      }
      if ($$pScfMiss{ScfLink}{TgtOldFile} =~ m/$GetMch/o) {
        $$pScfMiss{ScfLink}{TgtOldMch} = $1;
      }
      $debug and printf STDERR "%s. group %s, project %s, machine %s\n", &MySub,
        $$pScfMiss{ScfLink}{TgtOldGrp}||"''", $$pScfMiss{ScfLink}{TgtOldPrj}||"''",
        $$pScfMiss{ScfLink}{TgtOldMch}||"''";
    } else {
      printf STDERR "WARNING: don't get target for SCF link $$pScfMiss{NameScf}\n";
    }

    # collect hints for correct linking
    # - organize according to sequencing target
    if ($$pScfMiss{ScfLink}{TgtOldGrp} and $$pScfMiss{ScfLink}{TgtOldPrj}) {
      $hint{ByLibtgt}{$$pScfMiss{NameField}{tgt}}{TgtOldGrp} ||= $$pScfMiss{ScfLink}{TgtOldGrp};
      $hint{ByLibtgt}{$$pScfMiss{NameField}{tgt}}{TgtOldPrj} ||= $$pScfMiss{ScfLink}{TgtOldPrj};
    } else {
      $debug and printf STDERR "%s. target of $$pScfMiss{NameScf} doesn't yield group/project information\n", &MySub;
      $hint{ByLibtgt}{$$pScfMiss{NameField}{tgt}} ||= {};
    }
  }
  $bTimer and printf STDERR "%s. CPU time for SCF link validation\n", &MySub, &Sum((times)[0,2])-$time;
  printf "%d broken SCF link%s\n", $CtMiss, ($CtMiss==1) ? '':'s';

  # meanwhile report
  printf "\ntotal of %d lost/broken SCF link%s, corresponding to %d sequencing target%s\n",
    int keys %ScfMiss, (int(keys(%ScfMiss)) == 1) ? '':'s',
    int keys %{$hint{ByLibtgt}}, (int(keys(%{$hint{ByLibtgt}})) == 1) ? '':'s';
  unless (int (keys %ScfMiss)) { exit }
  foreach $ItLibtgt (keys %{$hint{ByLibtgt}}) {
    printf "  %s: %d\n", $ItLibtgt, int (grep { $_->{NameField}{tgt} eq $ItLibtgt } values %ScfMiss);
  }

  ##############################################################################
  # search SCFs according to hints
  # - construct queue of search paths from hints
  # - look for matching SCFs in search paths
  # - restore/install links

  # loop over sequencing targets
  $time = &Sum ((times)[0,2]);
  $hint{QueueInit} = { failed => 0.5 };
  foreach $ItLibtgt (keys %{$hint{ByLibtgt}}) {
    printf "\nSCF look-up - sequencing target %s, %d broken/missing SCF link%s\n", $ItLibtgt||"''",
      int (grep { $_->{NameField}{tgt} eq $ItLibtgt } values %ScfMiss), (int(keys(%ScfMiss)) == 1) ? '':'s';
    delete $hint{ByRecov};

    CureRestLibtgt: {
      # construct queue of hints (= SCF search paths) from hints
      # - hints are ordered according to:
      #   - their likelihood for match
      #   - their approximate search volume (jokers last!)
      @{$hint{QueueOld}} = @{$hint{queue}}; @{$hint{queue}} = ();
      @{$hint{all}{grp}} = &unique (grep { $_ }
        @{$hint{ByRecov}{grp}}, $prj{grp}, $hint{ByLibtgt}{$ItLibtgt}{TgtOldGrp}, '*');
      @{$hint{all}{prj}} = &unique (grep { $_ }
        @{$hint{ByRecov}{prj}}, uc $ItLibtgt, lc $ItLibtgt, $prj{prj}, $hint{ByLibtgt}{$ItLibtgt}{TgtOldPrj});
      # loop in reverse order on hint sources for group / project
      # note tricky usage of unshift and push!
      foreach $ItGrp (reverse @{$hint{all}{grp}}) {
        foreach $ItPrj (reverse @{$hint{all}{prj}}) {
          $ItHint = "/gen/fox/pro/$ItGrp/$ItPrj";
          if ($ItGrp ne '*' and ! -d $ItHint) { next }
          unshift @{$hint{queue}}, { path=>$ItHint, %{$hint{QueueInit}} };
        }
        if ($ItGrp ne '*') {
          push @{$hint{queue}}, { path=>"/gen/fox/pro/$ItGrp/*", %{$hint{QueueInit}} };
        }
      }
      $debug and &DataPrint ($hint{queue}, -handle=>\*STDERR);

      # loop over SCFs corresponding to sequencing target
      CureRestScf:
      foreach $pScfMiss (grep { $_->{NameField}{tgt} eq $ItLibtgt } values %ScfMiss) {
        printf "trying to recover SCF %s\n", $$pScfMiss{NameScf};

        # loop over hint paths
        foreach $ItHint (@{$hint{queue}}) {
          $debug and printf STDERR "%s. following hint path: %s\n", &MySub, $$ItHint{path}||"''";

          # look for first file matching in project path
          $$pScfMiss{ScfLink}{TgtNewFile}
            = (glob ("$$ItHint{path}/[0-9][0-9].[0-9][0-9].[0-9][0-9]_*/$$pScfMiss{NameScf}*"))[0];

          # found SCF
          # - eventually unzip files
          # - restore SCF link
          if ($$pScfMiss{ScfLink}{TgtNewFile}) {
            printf "found %s\n", $$pScfMiss{ScfLink}{TgtNewFile};
            if ($$pScfMiss{ScfLink}{TgtNewFile} =~ m/\.gz$/) {
              system "gunzip $$pScfMiss{ScfLink}{TgtNewFile} >& /dev/null";
              $$pScfMiss{ScfLink}{TgtNewFile} =~ s/\.gz$//;
              unless (-e $$pScfMiss{ScfLink}{TgtNewFile}) { next }
            }

            # restore SCF link
            unless (&FileLink ($$pScfMiss{ScfLink}{TgtNewFile}, $$pScfMiss{NameScf})) {
              printf STDERR "ERROR: unable to set link %s -> %s\n",
                $$pScfMiss{NameScf}, $$pScfMiss{ScfLink}{TgtNewFile};
              exit 1;  # avoid bulk loops for these fatal errors
            }
            unless (-e $$pScfMiss{NameScf}) {
              printf STDERR "ERROR: link %s -> %s was set, but it's not there!\n",
                $$pScfMiss{NameScf}, $$pScfMiss{ScfLink}{TgtNewFile};
              exit 1;  # avoid bulk loops for these fatal errors
            }
            delete $ScfMiss{$$pScfMiss{NameRead}};
            $CtRecov ++;

            # eventually expand hint information
            if ($$pScfMiss{ScfLink}{TgtNewFile} =~ m/$GetGrpPrj/o) {
              $bAddHint = 0;
              if (grep { $_ eq $1 } @{$hint{ByRecov}{grp}}) {
                push @{$hint{ByRecov}{grp}}, $1;
                $debug||$bMe and printf STDERR "%s. got additional hint from recover of SCF: group %s\n", &MySub, $1;
                $bAddHint = 1;
              }
              if (grep { $_ eq $2 } @{$hint{ByRecov}{prj}}) {
                push @{$hint{ByRecov}{prj}}, $2;
                $debug||$bMe and printf STDERR "%s. got additional hint from recover of SCF: project %s\n", &MySub, $2;
                $bAddHint = 1;
              }
              if ($bAddHint) {
                redo CureRestLibtgt;
              }
            }

            # recovery successful, turn on next SCF
            next CureRestScf;
          }
        }
      }
    }  # end: loop with current hint set

    # still broken SCFs?
    # xxx
  }

  # summary
  print  "\nSUMMARY\n";
  printf "recovered %d SCF%s, CPU time[s] %.1f\n", $CtRecov,
    (int($CtRecov)==1) ? '':'s', &Sum((times)[0,2])-$time;
  printf "SCFs still missing/broken: %s\n", int(keys(%ScfMiss)) ? '':'NONE';
  if (int (keys %ScfMiss)) {
    foreach $ItLibtgt (keys %{$hint{ByLibtgt}}) {
      printf "  %s: %d\n", $ItLibtgt, int (grep { $_->{NameField}{tgt} eq $ItLibtgt } values %ScfMiss);
    }
  }
}


# SCF info structure from SCF file name
#
# INTERFACE
# - argument 1: SCF name
#
# - options:
#   -debug      [STD]
#
# - return val: reference to SCF info structure:
#
sub ScfCureInfo {
  my ($ScfName, %opt) = @_;
  my ($debug);
  my ($pScfInfo);

  # function parameters
  $debug = $opt{-debug};

  # start constructing information data
  $pScfInfo = {};
  $$pScfInfo{NameScf} = $ScfName;

  # construct read information
  $$pScfInfo{NameRead} = $$pScfInfo{NameScf};
  $$pScfInfo{NameRead} =~ s/SCF$//;
  unless ($$pScfInfo{NameField} = &ReadidToFields($$pScfInfo{NameRead},-clone=>1)) {
    printf STDERR "WARNING: read %s doesn't fit to GSCJ naming scheme - won't be able to restore SCF\n",
      $$pScfInfo{NameRead}||"''";
    next;
  }
  $debug and printf STDERR "%s. read %s corresponds to sequencing target %s\n", &MySub,
    $$pScfInfo{NameRead}, $$pScfInfo{NameField}{tgt};

  # exit SUB
  return $pScfInfo;
}


################################################################################
# perform assembly
################################################################################


# guess manual assembly recipee for primer-walk sequences
#
# INTERFACE
# - argument 1:   path of report produced by script action -ListOligo
#                 that was used to prepare custom primer and extra sequences
# - argument 2+:  GAP4 database(s): (folder/)database.version
#
# - global options
#   -debug        [STD]
#   -SlcOligo
#   -timer        print time performance protocol to STDOUT
#
# DESCRIPTION
# - internal data structures
#   %primer              primer information from report text (from arg1)
#   $primer{RepPlain}  plain primer report text (from arg1)
#   $primer{index}     primer ID index on oligo and template information
#   %oligo               oligo information from GAP4 database
#   $oligo{list}       xxx
#   $oligo{index}      xxx
#   $oligo{ContigIdx}  xxx
#   $read{list}        list of reads, finally reads to be assembled into
#                        current GAP4 project
#   $read{index}       index of reads
#
sub ProgSeqprimEnter {
  my ($PathOligoRep, @PathProj) = @_;
  my ($debug, $dbg2, $bTimer, $SlcOligo);
  my (%gap, $ProjLast, $pCtgStruct);
  my (%primer, $pPrimer, $ItPrimer, $ItTempl, @TemplPlus, $pAnnot, $pOligo, %oligo);
  my ($ItContig, @ContigMisorient, %read, $ReadPresent);
  my (%file, $sCall, $pCall);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  $SlcOligo = $ProgOpt{-SlcOligo};

  print  "$ProgFile -$ProgMode\n";

  ##############################################################################
  # get sequencing information

  # read oligo/sequence listing
  $primer{RepPlain} = &ReadFile ($PathOligoRep);
  if ($primer{RepPlain} =~ m/SEQUENCING\n(.+)$/s) {
    $primer{RepPlain} = $1;
    if ($primer{RepPlain} =~ m/\n{2,}/) { $primer{RepPlain} = $` . $&; }
    $primer{RepTab} = &PlainToTable (\$primer{RepPlain},
      -TabType  => 'AH',
      -ColLabel => [ 'template', 'primer' ],
      -delimit  => { line=>$reEndl, col=>'\s+' },
      );
  } elsif (int (grep { m/(\t| +)/ } split (/\n/, $primer{RepPlain})) > 5) {
    $primer{RepTab} = &PlainToTable ($PathOligoRep,
      -TabType  => 'AH',
      -ColLabel => [ 'template', 'primer' ],
      -delimit  => { line=>$reEndl, col=>'\s+' },
      );
  } else {
    printf STDERR "ERROR: no primer sequencing list found in file %s\n", $PathOligoRep;
    exit 1;
  }

  # non-redundant primer data in %{$primer{index}}
  $primer{index} = {};
  foreach $pPrimer (@{$primer{RepTab}}) {
    if (exists $primer{index}{$$pPrimer{primer}}) {
      push @{$primer{index}{$$pPrimer{primer}}{TemplArr}}, $$pPrimer{template};
    } else {
      $primer{index}{$$pPrimer{primer}} = $pPrimer;
      $$pPrimer{TemplArr} = [ $$pPrimer{template} ];
    }
    delete $$pPrimer{template};
  }
  if ($debug) {
    printf STDERR "%s. primers found in SeqPrimer file %s:\n", &MySub, $PathOligoRep;
    &DataPrint ($primer{index}, -handle=>\*STDERR, space=>2);
  }

  ##############################################################################
  # get GAP4 database information, Experiment items in GAP4 folder

  # loop over GAP4 databases
  SeqPrimerEnterProj:
  foreach my $ItProj (@PathProj) {
    printf "analysis of database %s\n", $ItProj||"''";
    %oligo = ();

    %gap = %{ &GapNameFields($ItProj) };
    if ($gap{exists}) {
      $debug and printf STDERR "%s. working on database %s\n", &MySub, $gap{full}||"''";
    } else {
      printf STDERR "%s. unable to read GAP4 database %s\n", &MySub, $gap{full}||"''";
      next;
    }

    # find Experiment files in GAP4 directory which correspond to listed templates
    # => @{$primer{index}{$primer}{TemplTransl}}
    if (! $ProjLast or $gap{dir} ne &GapNameFields($ProjLast)->{dir}) {
      foreach $pPrimer (values %{$primer{index}}) {
        delete $$pPrimer{TemplTransl};
        foreach $ItTempl (@{$$pPrimer{TemplArr}}) {
          if ($ItTempl =~ m/\.[p-t]\d$/) {
            @TemplPlus = &ReadDir ($gap{dir}||'./', -select=>'^'.&RegexpEncode($ItTempl).'$');
          } else {
            $ItPrimer = $$pPrimer{primer};
            @TemplPlus = &ReadDir ($gap{dir}||'./', -select=>
              ($ItPrimer =~ m/^[rs]$/) ? "^$ItTempl\\.$ItPrimer\[2-9\]\$" : "^$ItTempl\\.p\\d\$");
          }
          @TemplPlus or @TemplPlus = ("($ItTempl)");
          $$pPrimer{TemplTransl} ||= [];
          push @{$$pPrimer{TemplTransl}}, @TemplPlus;
        }
        if ($$pPrimer{TemplTransl} !~ m/^\(/) {
          @{$$pPrimer{TemplTransl}} = sort @{$$pPrimer{TemplTransl}};
          $debug and printf STDERR "%s. template translation:\n  %s => %s\n", &MySub,
            join (' ', @{$$pPrimer{TemplArr}}), join (' ', @{$$pPrimer{TemplTransl}});
        }
      }
    }

  ##############################################################################
  # get information from GAP4 database

    # redo block for contig reorientation
    $ProjLast = $ItProj;
    SeqPrimerEnterOrient: {

      # read contig source into data structure
      &DataDecross ($pCtgStruct);
      $pCtgStruct = &ContigStructMeta ([ $ItProj ],
        -annotation => 1,
        -timer      => $bTimer,
        -debug      => $dbg2);
      # summary debug of contig set data structure possible in &SeqAlign::Assembly::ContigStruct

      # loop over oligos
      $oligo{list} = [];
      foreach $pAnnot (grep { $_->{type} eq 'OLIG' } @{$$pCtgStruct{annotation}}) {

        # parse oligo fields from annotation text
        $$pAnnot{text} =~ s/serial#/ID/g;
        $pOligo = {};
        while ($$pAnnot{text} =~ m/^(ID|template|PairWith|PcrPair|sequence)=(.+)$/img) {
          $$pOligo{lc $1} = $2;
        }
        unless ($$pOligo{id} and $$pOligo{sequence}) { next };
        if ($SlcOligo and $$pOligo{id} !~ m/$SlcOligo/o) { next };
        $debug and printf STDERR "%s. entered oligo %s\n", &MySub, $$pOligo{id};

        # enter annotation fields
        $$pOligo{contig} = $$pAnnot{contig}{id};
        $$pOligo{read} = $$pAnnot{read}{id};
        $$pOligo{orient} = $$pAnnot{corient};
        $$pOligo{offset} = ($$pAnnot{corient} > 0) ?
          $$pAnnot{cpos} : ($$pAnnot{cpos} + $$pAnnot{length} - 1);
        $debug and printf STDERR "%s. %d reads in oligo's contig (%s)\n", &MySub,
          int keys %{$$pCtgStruct{contig}{$$pOligo{contig}}{read}}, $$pOligo{contig}{id};
        $$pOligo{readnear} = ((
          sort { abs ($a->{pos}{-1}-($$pOligo{offset}+40)) <=> abs ($b->{pos}{-1}-($$pOligo{offset}+40)) }
          values %{$$pCtgStruct{contig}{$$pOligo{contig}}{read}}
          )[0] || $$pOligo{contig})->{id};
        $$pOligo{end} = ($$pAnnot{corient} > 0) ?
          ($$pAnnot{cpos} + $$pAnnot{length} - 1) : $$pAnnot{cpos};
        if ($$pOligo{template}) {
          $$pOligo{template} = [ split (',', $$pOligo{template}) ];
        }

        # enter into oligo list
        push @{$oligo{list}}, $pOligo;
      }

      # oligo index
      $oligo{index} = { map { ($_->{id} => $_) } @{$oligo{list}} };

  ##############################################################################
  # cross-match between primer list and oligos in database
  # order contigs

      # loop over matching primers
      $primer{MatchIdx} = {
        map { ($_->{primer} => $_) }
        grep { ! exists $oligo{DoneIdx} or ! $oligo{DoneIdx}{$_->{primer}} }
        grep { exists $oligo{index}{$_->{primer}} }
        values %{$primer{index}} };
      # next database if there's no primer match
      unless (%{$primer{MatchIdx}}) {
        printf "no primer match found in database %s\n", $ItProj||"''";
        next SeqPrimerEnterProj;
      }

      # enter matching templates for r/s primers
      # ...

      # enter new and available Experiment files into GAP4 database
      %{$read{index}} = map { ($_=>1) }
        grep { ! $$pCtgStruct{read}{$_} }
        grep { m/^\w/ } map { @{$_||[]} }
        &DataTreeSlc ($primer{MatchIdx}, [[0,'all'],['TemplTransl'],[0,'all']]);
      @{$read{list}} = keys %{$read{index}};
      if ($debug) {
        printf STDERR "%s. assembling reads:\n", &MySub;
        print  STDERR '', map {"  $_\n"} @{$read{list}};
      }
      $file{assemble} = $ProgParam{TmpManag}->Create();
      WriteFile ($file{assemble}, join('', map {"$_\n"} @{$read{list}}));
      $pCall = &CallExtClosure ($CorePath{call}{GapAssembNew}, $gap{full}, $file{assemble});
      unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
        -func=>$pCall, -log=>1)) {
        print STDERR "ERROR in assembling new reads into database. It may be corrupt now\n";
        exit 1;
      }

      # sort oligos into matching contigs
      $oligo{ContigIdx} = {};
      foreach $pPrimer (values %{$primer{MatchIdx}}) {
        $oligo{ContigIdx}{$oligo{index}{$pPrimer->{primer}}{contig}} ||= { id => $oligo{index}{$pPrimer->{primer}}{contig} };
        push @{$oligo{ContigIdx}{$oligo{index}{$pPrimer->{primer}}{contig}}{oligo}},
          $oligo{index}{$pPrimer->{primer}};
      }

      # analyse direction of contigs/oligos
      foreach $ItContig (values %{$oligo{ContigIdx}}) {
        $$ItContig{orient} = { map { ($_->{orient}=>1) } @{$$ItContig{oligo}} };
      }
      if ($debug) {
        printf STDERR "%s. matching primers:\n", &MySub;
        &DataPrint ($primer{MatchIdx}, -handle=>\*STDERR);
        printf STDERR "%s. matching contigs:\n", &MySub;
        &DataPrint ($oligo{ContigIdx}, -handle=>\*STDERR);
      }

      # reverse-complement misoriented contigs, redo
      if (@ContigMisorient =
          grep { (exists $_->{orient}{-1}) and not (exists $_->{orient}{1}) }
          values %{$oligo{ContigIdx}}) {
        $sCall = join (' ', $CorePath{call}{GapContigRevcompl}, $ItProj,
          (map { $_->{id} } @ContigMisorient));
        $debug and printf STDERR "%s. complementing contigs, call:\n  %s\n", &MySub, $sCall;
        if (&GapSafeCall ($sCall)) {
          printf STDERR "ERROR: complementing contigs failed, call:\n  %s\n", $sCall;
          exit 1;
        }
        redo SeqPrimerEnterOrient;
      }

  ##############################################################################
  # recipee report and database actions

      # locate positively oriented oligos
      $oligo{plus} = [ sort { $a->{contig} cmp $b->{contig} or $a->{id} cmp $b->{id} }
        grep { exists($_->{orient}) and $_->{orient}==1 }
        @{ &DataTreeSlc ($oligo{ContigIdx}, [[0,'all'],['oligo'],[0,'all']]) } ];
      if (@{$oligo{plus}}) {

        # recipee
        # - list all cases of positively oriented primers
        # - list according contig, primer, nearest read offset, templates
        $file{recipee} = $ProgParam{TmpManag}->Create();
        open (OUTRECIP, ">$file{recipee}");
        print  OUTRECIP "# contig\tprimer\tread_near\tread_new\n";
        foreach (sort { $a->{id} cmp $b->{id} } @{$oligo{plus}}) {
          printf OUTRECIP "%s\t%s\t%s\t%s\n", $_->{contig}, $_->{id}, $_->{readnear},
            join (' ', @{$primer{index}{$_->{id}}{TemplTransl}});
        }
        # this is a preliminary approach to enter extra r/s reads
        # - the orientation of contigs is not regarded
        $debug and &DataPrint ($primer{index}, -handle=>\*STDERR);
        foreach $ItPrimer (qw(r s)) {
          if (exists $primer{index}{$ItPrimer}) {
            foreach my $ItRead (@{$primer{index}{$ItPrimer}{TemplTransl}}) {
              if (exists $$pCtgStruct{clone}{(&ReadidToFields($ItRead)||{})->{cln}} and
                  keys %{$$pCtgStruct{read}{$ItRead}{contig}{read}} <= 1) {
                $ReadPresent = ((grep { exists $_->{id} }
                  values %{$$pCtgStruct{clone}{&ReadidToFields($ItRead)->{cln}}{$ItPrimer}})[0]||{})->{id};
                printf OUTRECIP "%s\t%s\t%s\t%s\n",
                  $$pCtgStruct{read}{$ReadPresent}{contig}{id},
                  $ItPrimer, $ReadPresent, $ItRead;
              }
            }
          }
        }
        close OUTRECIP;
        system "$CorePath{call}{edit} $file{recipee} & ";

        # invoke GAP4
        $pCall = &CallExtClosure ('cd', $gap{dir} || './', ';',
          $CorePath{call}{gap}, $gap{full});
        unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
          -func=>$pCall, -log=>1)) {
          print STDERR "ERROR while opening database with GAP4. It may be corrupt now\n";
          exit 1;
        }

        # reverse-complement contigs with both orientations for oligos, redo
        if (@ContigMisorient =
            grep { exists $_->{orient}{-1} and exists $_->{orient}{1} }
            values %{$oligo{ContigIdx}}) {

          # don't regard these oligos anymore
          $oligo{DoneIdx} = { map { $_->{id} => $_ } @{$oligo{plus}} };

          # complement contigs
          $sCall = join (' ', $CorePath{call}{GapContigRevcompl}, $ItProj,
            (map { $_->{id} } @ContigMisorient));
          $debug and printf STDERR "%s. complementing contigs, call:\n  %s\n", &MySub, $sCall;
          if (&GapSafeCall ($sCall)) {
            printf STDERR "ERROR: complementing contigs failed, call:\n  %s\n", $sCall;
            exit 1;
          }
          redo SeqPrimerEnterOrient;
        }
      }

      elsif (! %{$oligo{DoneIdx}}) {
        printf "no oligo matches found in database %s\n", $ItProj;
        next SeqPrimerEnterProj;
      }

    } # SeqPrimerEnterOrient
  } # SeqPrimerEnterProj

  # tidy up
  $debug or unlink (grep{-e $_} $file{assemble}, $file{recipee});
}


# perform cyclic procedure of "Gap.pl -AddRev ..." and "GapAddBlast.sh ..."
# *** out of curation ***
#
# INTERFACE
# - argument 1: GAP4 database: (folder/)database.version
#
sub ProgAssembCplx {
  my ($PathProj) = @_;
  my (%gap, $debug, $dbg2, %slc, $OutStump, $bTimer, $time, $ThreshAddNum);
  my (%path, $sCall, $pCall, $ret);
  my ($CtCycle, $CtAdded);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  $ThreshAddNum = $ProgOpt{-ThreshAddNum} || $ProgParam{default}{ThreshAddNum};

  print  "$ProgFile -$ProgMode\n";
  %gap = %{ &GapNameFields ($PathProj) };
  unless (-w $gap{full}) {
    printf STDERR "ERROR: unable to read/write GAP4 database %s", $gap{full};
    exit 1;
  }
  printf "GAP4 database: %s%s.%s\n", $gap{dir}, $gap{name}, $gap{version};

  # OutStump, self-logging
  $OutStump = &TimeStr (-format=>'CompactComp');
  unless ($ProgOpt{-log}) {
    &LogOpen (-file=>"$gap{dir}${OutStump}_${ProgMode}.log", -prog=>"$ProgFile -$ProgMode");
  }

  # delete singles
  $path{FofnSingle} = "$gap{dir}${OutStump}_single.foid";
  $sCall = join (' ', $CorePath{call}{GapReadSingle}, $gap{full},
    ">$path{FofnSingle}");
  if (&GapSafeCall ($sCall, -debug=>$dbg2)) {
    print STDERR "ERROR in getting list of singlet contigs\n";
    exit 1;
  }
  if (-s $path{FofnSingle}) {
    printf "\ndeleting singles: %d\n", &wc_l ($path{FofnSingle});
    $pCall = &CallExtClosure ($CorePath{call}{GapReadDel}, $gap{full},
      $path{FofnSingle}, '>/dev/null');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      print STDERR "ERROR in deleting files in target database. It may be corrupt now, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
  }

  # subprocess Gap.pl -AddBlast over the whole contigs
  print "\nBLAST / ASSEMBLY of whole large contigs\n";
  $slc{len}  = 1000;
  $slc{ends} = 0;
  $OutStump  = &TimeStr (-format=>'CompactComp');
  $OutStump .= '_AddBlast';
  $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
  $sCall = "$CorePath{call}{GapAddBlast} -SlcLen=$slc{len} -SlcEnds=$slc{ends} -OutStump=$OutStump $gap{full}";
  $debug and printf STDERR "%s. calling process to find BLAST hits:\n  $sCall\n", &MySub;
  if ($ret = int (system ($sCall) / 256)) {
    print STDERR "ERROR: call to find BLAST hits failed\n";
    exit 1;
  }
  $CtAdded = &wc_l ($path{FoidGot});
  unless ($CtAdded) {
    print STDERR "no reads found, probably ERROR in retrieval process\n";
    exit 1;
  }

  # perform shotgun assembly
  $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
    $path{FoidGot}, "11.0 NO NO NO 'REPT SPSQ ENZ2' 5",
    ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
  $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
  unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
    -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
    -func=>$pCall, -log=>1)) {
    printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
    exit 1;
  }
  $CtAdded = &wc_l ("$path{FoidGot}.assemb");
  printf "reads assembled into database: %d\n", $CtAdded;

  # action loop
  {
    $CtCycle ++;
    printf "\n*** CYCLE STEP %d\n", $CtCycle;

    # subprocess Gap.pl -AddRev
    $slc{len} = 1800;
    $slc{id} = '^JC2';
    $OutStump = &TimeStr (-format=>'CompactComp');
    $OutStump .= '_AddRev';
    $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
    print  "\nFIND REVERSE READS / ASSEMBLY\n";
    $sCall = "$CorePath{call}{GapAddRev} -OutStump=$OutStump -SlcLen=$slc{len} '-SlcID=$slc{id}' -FilterContam=2 $gap{full}";
    $debug and printf STDERR "%s. calling process to add reverse reads:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    if ($ret = int (system ($sCall) / 256)) {
      print STDERR "ERROR: call to add reverse reads failed\n";
      exit 1;
    }

    # perform shotgun assembly
    $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
      $path{FoidGot}, "11.0 YES YES NO 'REPT SPSQ ENZ2' 50",
      ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
    $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
    $CtAdded = &wc_l ("$path{FoidGot}.assemb") + &wc_l ("$path{FoidGot}.single");
    printf "reads added to database: %d\n", $CtAdded;
    printf "reads assembled: %d\n", &wc_l ("$path{FoidGot}.assemb");

    # delete singles
    $path{FofnSingle} = "$gap{dir}${OutStump}_single.foid";
    $sCall = join (' ', $CorePath{call}{GapReadSingle}, $gap{full},
      ">$path{FofnSingle}");
    if (&GapSafeCall ($sCall, -debug=>$dbg2)) {
      printf STDERR "ERROR in getting list of singlet contigs, call:\n  %s\n", $sCall;
      exit 1;
    }
    if (-s $path{FofnSingle}) {
      printf "\ndeleting singles: %d\n", &wc_l ($path{FofnSingle});
      $pCall = &CallExtClosure ($CorePath{call}{GapReadDel}, $gap{full},
        $path{FofnSingle}, '>/dev/null');
      unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
        -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
        -func=>$pCall, -log=>1)) {
        printf STDERR "ERROR in deleting files in target database. It may be corrupt now. Call:\n %s\n", &$pCall ('WhatRUCalling');
        exit 1;
      }
    }

print "\ndone\n";
exit 0;

    # subprocess Gap.pl -AddBlast
    print "\nBLAST / ASSEMBLY\n";
    $slc{len}  = 420;
    $slc{ends} = 500;
    $OutStump  = &TimeStr (-format=>'CompactComp');
    $OutStump .= '_AddBlast';
    $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
    $sCall = "$CorePath{call}{GapAddBlast} -SlcLen=$slc{len} -SlcEnds=$slc{ends} -OutStump=$OutStump $gap{full}";
    $debug and printf STDERR "%s. calling process to find BLAST hits:\n  $sCall\n", &MySub;
    if ($ret = int (system ($sCall) / 256)) {
      printf STDERR "ERROR: call to find BLAST hits failed, call:\n  %s\n", $sCall;
      exit 1;
    }
    $CtAdded = &wc_l ($path{FoidGot});
    unless ($CtAdded) {
      printf STDERR "no reads found, probably ERROR in retrieval process\n";
      exit 1;
    }

    # perform shotgun assembly
    $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
      $path{FoidGot}, "11.0 NO NO NO 'REPT SPSQ ENZ2' 50",
      ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
    $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
    $CtAdded = &wc_l ("$path{FoidGot}.assemb");
    printf "reads assembled into database: %d\n", $CtAdded;

    # cycle
    if ($CtAdded > $ThreshAddNum) { redo }
  }

#  # subprocess Gap.pl -AssembleTricky
#  print "\nTRICKY ASSEMBLY\n";
#  $slc{len} = 850;
#  $OutStump = &TimeStr (-format=>'CompactComp') . '_AssembleTricky';
#  $sCall = "$CorePath{call}{GapAssembTricky} -SlcLen=$slc{len} $gap{full}";
#  $debug and printf STDERR "%s. calling process for tricky assembly:\n  %s\n", &MySub, $sCall;
#  if ($ret = int (system ($sCall) / 256)) {
#    print STDERR "ERROR: call of tricky assembly process failed, call:\n  %s\n", $sCall;
#    exit 1;
#  }

  # ready
  print "\ndone\n";
}


# for Singlets: perform cyclic procedure of -AddRev and -AddBlast
# *** out of curation ***
#
# INTERFACE
# - argument 1: GAP4 database: (folder/)database.version
#
sub ProgAssembSingles {
  my ($PathProj) = @_;
  my (%gap, $debug, $dbg2, %slc, $OutStump, $ThreshAddNum, $bTimer, $time);
  my (%path, $sCall, $pCall, $ret);
  my ($CtCycle, $CtAdded);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  $ThreshAddNum = $ProgOpt{-ThreshAddNum} || $ProgParam{default}{ThreshAddNum};

  print  "$ProgFile -$ProgMode\n";
  %gap = %{ &GapNameFields ($PathProj) };
  unless (-w $gap{full}) {
    printf STDERR "ERROR: unable to read/write GAP4 database %s", $gap{full};
    exit 1;
  }
  printf "GAP4 database: %s%s.%s\n", $gap{dir}, $gap{name}, $gap{version};

  # OutStump, self-logging
  $OutStump = &TimeStr (-format=>'CompactComp');
  unless ($ProgOpt{-log}) {
    &LogOpen (-file=>"$gap{dir}${OutStump}_${ProgMode}.log", -prog=>"$ProgFile -$ProgMode");
  }

  # delete singles
  $path{FofnSingle} = "$gap{dir}${OutStump}_single.foid";
  $sCall = join (' ', $CorePath{call}{GapReadSingle}, $gap{full},
    ">$path{FofnSingle}");
  if (&GapSafeCall ($sCall, -debug=>$dbg2)) {
    print STDERR "ERROR in getting list of singlet contigs\n";
    exit 1;
  }
  if (-s $path{FofnSingle}) {
    printf "\ndeleting singles: %d\n", &wc_l ($path{FofnSingle});
    $pCall = &CallExtClosure ($CorePath{call}{GapReadDel}, $gap{full},
      $path{FofnSingle}, '>/dev/null');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      print STDERR "ERROR in deleting files in target database. It may be corrupt now, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
  }

  # subprocess Gap.pl -AddBlast over the whole contigs
  print "\nBLAST / ASSEMBLY of whole large contigs\n";
  $slc{len}  = 1000;
  $slc{ends} = 0;
  $OutStump  = &TimeStr (-format=>'CompactComp');
  $OutStump .= '_AddBlast';
  $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
  $sCall = "$CorePath{call}{GapAddBlast} -SlcLen=$slc{len} -SlcEnds=$slc{ends} -OutStump=$OutStump $gap{full}";
  $debug and printf STDERR "%s. calling process to find BLAST hits:\n  $sCall\n", &MySub;
  if ($ret = int (system ($sCall) / 256)) {
    print STDERR "ERROR: call to find BLAST hits failed\n";
    exit 1;
  }
  $CtAdded = &wc_l ($path{FoidGot});
  unless ($CtAdded) {
    print STDERR "no reads found, probably ERROR in retrieval process\n";
    exit 1;
  }

  # perform shotgun assembly
  $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
    $path{FoidGot}, "11.0 NO NO NO 'REPT SPSQ ENZ2' 5",
    ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
  $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
  unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
    -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
    -func=>$pCall, -log=>1)) {
    printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
    exit 1;
  }
  $CtAdded = &wc_l ("$path{FoidGot}.assemb");
  printf "reads assembled into database: %d\n", $CtAdded;

  # action loop
  {
    $CtCycle ++;
    printf "\n*** CYCLE STEP %d\n", $CtCycle;

    # subprocess Gap.pl -AddRev
    $slc{len} = 1800;
    $slc{id} = '^JC2';
    $OutStump = &TimeStr (-format=>'CompactComp');
    $OutStump .= '_AddRev';
    $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
    print  "\nFIND REVERSE READS / ASSEMBLY\n";
    $sCall = "$CorePath{call}{GapAddRev} -OutStump=$OutStump -SlcLen=$slc{len} '-SlcID=$slc{id}' -FilterContam=2 $gap{full}";
    $debug and printf STDERR "%s. calling process to add reverse reads:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    if ($ret = int (system ($sCall) / 256)) {
      print STDERR "ERROR: call to add reverse reads failed\n";
      exit 1;
    }

    # perform shotgun assembly
    $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
      $path{FoidGot}, "11.0 YES YES NO 'REPT SPSQ ENZ2' 50",
      ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
    $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
    $CtAdded = &wc_l ("$path{FoidGot}.assemb") + &wc_l ("$path{FoidGot}.single");
    printf "reads added to database: %d\n", $CtAdded;
    printf "reads assembled: %d\n", &wc_l ("$path{FoidGot}.assemb");

    # delete singles
    $path{FofnSingle} = "$gap{dir}${OutStump}_single.foid";
    $sCall = join (' ', $CorePath{call}{GapReadSingle}, $gap{full},
      ">$path{FofnSingle}");
    if (&GapSafeCall ($sCall, -debug=>$dbg2)) {
      printf STDERR "ERROR in getting list of singlet contigs, call:\n  %s\n", $sCall;
      exit 1;
    }
    if (-s $path{FofnSingle}) {
      printf "\ndeleting singles: %d\n", &wc_l ($path{FofnSingle});
      $pCall = &CallExtClosure ($CorePath{call}{GapReadDel}, $gap{full},
        $path{FofnSingle}, '>/dev/null');
      unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
        -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
        -func=>$pCall, -log=>1)) {
        printf STDERR "ERROR in deleting files in target database. It may be corrupt now. Call:\n %s\n", &$pCall ('WhatRUCalling');
        exit 1;
      }
    }

print "\ndone\n";
exit 0;

    # subprocess Gap.pl -AddBlast
    print "\nBLAST / ASSEMBLY\n";
    $slc{len}  = 420;
    $slc{ends} = 500;
    $OutStump  = &TimeStr (-format=>'CompactComp');
    $OutStump .= '_AddBlast';
    $path{FoidGot} = "$gap{dir}${OutStump}_got.foid";
    $sCall = "$CorePath{call}{GapAddBlast} -SlcLen=$slc{len} -SlcEnds=$slc{ends} -OutStump=$OutStump $gap{full}";
    $debug and printf STDERR "%s. calling process to find BLAST hits:\n  $sCall\n", &MySub;
    if ($ret = int (system ($sCall) / 256)) {
      printf STDERR "ERROR: call to find BLAST hits failed, call:\n  %s\n", $sCall;
      exit 1;
    }
    $CtAdded = &wc_l ($path{FoidGot});
    unless ($CtAdded) {
      printf STDERR "no reads found, probably ERROR in retrieval process\n";
      exit 1;
    }

    # perform shotgun assembly
    $pCall = &CallExtClosure ($CorePath{call}{GapAssembShotgun}, $gap{full},
      $path{FoidGot}, "11.0 NO NO NO 'REPT SPSQ ENZ2' 50",
      ">$gap{dir}${OutStump}_assemble.log", "2>>$gap{dir}${OutStump}_error.log");
    $debug and printf STDERR "%s. calling process for shotgun assembly:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
    unless (&PhysicalAccessCtrl ($gap{full}, -mode=>'func', -noLock=>1,
      -mail => { ProcStamp=>"$ProgFile -$ProgMode" },
      -func=>$pCall, -log=>1)) {
      printf STDERR "ERROR: shotgun assembly call failed, call:\n  %s\n", &$pCall ('WhatRUCalling');
      exit 1;
    }
    $CtAdded = &wc_l ("$path{FoidGot}.assemb");
    printf "reads assembled into database: %d\n", $CtAdded;

    # cycle
    if ($CtAdded > $ThreshAddNum) { redo }
  }

  # ready
  print "\ndone\n";
}


################################################################################
# annotation concepts
################################################################################


# list oligo information from sequence tags
#
# INTERFACE
# - argument 1:  contig data source
# - argument 2*: contig specifier(s) for selection
#
# - global options
#   -debug       [STD]
#   -SlcOligo    [STD]
#   -timer       [STD]
#
sub ProgListOligo {
  my ($PathSrc, @contig) = @_;
  my ($debug, $dbg2, $bTimer, $time, $SlcOligo);
  my ($pCtgStruct, @oligo, %OligoIdx, %OligoPcr, @OligoPcr, @OligoTpl);
  my ($pTempl, @id);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  $SlcOligo = $ProgOpt{-SlcOligo};

  $debug and printf "%s. analysis of database %s\n", &MySub, $PathSrc||"''";

  ##############################################################################
  # get oligo information

  # read contig source into data structure
  $bTimer and $time = &Sum ((times)[0,2]);
  unless ($pCtgStruct = &ContigStructMeta ([ $PathSrc ],
    -annotation => 1,
    -SlcContig  => \@contig,
    -timer      => $bTimer,
    -debug      => $dbg2)) {
    # error message was done in &SeqAlign::Assembly::ContigStruct
    exit 1;
  }
  # summary debug of contig set data structure possible in &SeqAlign::Assembly::ContigStruct
  $bTimer and printf STDERR "%s. CPU time for retrieving scaffold set data structure: %.3f s\n", &MySub, &Sum((times)[0,2])-$time;

  # loop over oligos
  foreach my $pTag (grep { $_->{type} eq 'OLIG' } @{$$pCtgStruct{annotation}}) {

    # parse oligo fields from annotation text
    $$pTag{text} =~ s/serial#/id/g;
    my $pOligoNew = {};
    while ($$pTag{text} =~ m/^(id|template|PairWith|PcrPair|PcrTemplate|PcrProduct|sequence)=(.+)$/img) {
      $$pOligoNew{lc $1} = $2;
      $$pOligoNew{lc $1} =~ s/(^ +| +$)//;
    }
    unless ($$pOligoNew{id} and $$pOligoNew{sequence}) { next }
    if ($SlcOligo and $$pOligoNew{id} !~ m/$SlcOligo/o) { next }

    # calculate/enter positional fields
    $$pOligoNew{contig} = $$pTag{contig}{id};
    $$pOligoNew{read} = $$pTag{read}{id};
    $$pOligoNew{offset} = ($$pTag{corient} > 0) ?
      $$pTag{cpos} : ($$pTag{cpos} + $$pTag{length} - 1);
    $$pOligoNew{end} = ($$pTag{corient} > 0) ?
      ($$pTag{cpos} + $$pTag{length} - 1) : $$pTag{cpos};
    if ($debug) {
      printf STDERR "%s. oligo %s\n", &MySub, $$pOligoNew{id}||"''";
      printf STDERR "  on contig: orient %d, offset %d, cpos %d, length %d\n",
        $$pTag{corient}, $$pOligoNew{offset}, $$pTag{cpos}, $$pTag{length};
      printf STDERR "  on read: orient %d, pos %d, length %d\n",
        $$pTag{rorient}, $$pTag{rpos}, $$pTag{length};
      printf STDERR "  contig: ID %s, length %d\n",
        $$pTag{contig}{id}, $$pTag{contig}{length};
      printf STDERR "  read: ID %s, orient %d, range %d..%d, length %d)\n",
        $$pTag{read}{id}, $$pTag{read}{orient},
        $$pTag{read}{pos}{-1}, $$pTag{read}{pos}{1},
        $$pTag{read}{length};
    }

    # enter into oligo list
    push @oligo, $pOligoNew;

    # used as a PCR primer?
    if ($$pOligoNew{pairwith} or $$pOligoNew{pcrpair}) {
      foreach (split (/,/, $$pOligoNew{pairwith} || $$pOligoNew{pcrpair})) {
        @id = sort { $a cmp $b } $$pOligoNew{id}, $_;
        $OligoPcr{$id[0]}{$id[1]} ||= $$pOligoNew{pcrtemplate} || '0';
      }
    }

    # enter into template list
    if ($$pOligoNew{template}) {
      push @OligoTpl, map { {
        id    => $_,
        oligo => $pOligoNew,
        }; }
        map { s/(^ +| +$)//; $_; }
        split (',', $$pOligoNew{template});
    }
  }

  ##############################################################################
  # process PCR information

  # oligo index
  %OligoIdx = map { ($_->{id} => $_) } @oligo;

  # set references for paired primers
  foreach my $key1 (keys %OligoPcr) {
    foreach my $key2 (keys %{$OligoPcr{$key1}}) {
      push @OligoPcr, {
        template => $OligoPcr{$key1}{$key2} || 'gDNA',
        p1       => $key1,
        p2       => $key2,
        length   =>
          ($OligoIdx{$key1}{contig} eq $OligoIdx{$key2}{contig}) ?
          ((abs ($OligoIdx{$key1}{offset}-$OligoIdx{$key2}{offset}) + 1) . ' bp') : '? [not on same contig]',
        };
    }
  }

  ##############################################################################
  # formatted output

  print "\nOLIGO LISTING\n";
  if (@oligo) {
    foreach (sort { $a->{id} cmp $b->{id} } @oligo) {
      printf "%s  %s\n", $_->{id}, $_->{sequence};
    }
    print "\nOLIGO TOPOLOGY\n";
    $PathSrc = &PathExpand ($PathSrc);
    foreach (sort { $a->{id} cmp $b->{id} } @oligo) {
      printf "%s\n", join ('  ',
        $_->{id}, $_->{sequence},
        $PathSrc, $_->{contig}, $_->{read}
        );
    }
    if (@OligoPcr) {
      print "\nPCR\n";
      foreach (sort {
        $a->{template} cmp $b->{template} or
              $a->{p1} cmp $b->{p1} or
              $a->{p2} cmp $b->{p2} 
      } @OligoPcr) {
        printf "%s  %s/%s  %s\n",
          $_->{template}, $_->{p1}, $_->{p2}, $_->{length};
      }
    }
    if (@OligoTpl) {
      foreach $pTempl (@OligoTpl) {
        $$pTempl{field} = &ReadidToFields("$$pTempl{id}.s1") || {};
      }
      print "\nSEQUENCING\n";
      foreach $pTempl (sort {
        $a->{field}{lib} cmp $b->{field}{lib} or
        $a->{field}{plt} <=> $b->{field}{plt} or
        $a->{field}{coo} cmp $b->{field}{coo} or
                $a->{id} cmp $b->{id} or
         $a->{oligo}{id} cmp $b->{oligo}{id} 
      } @OligoTpl) {
        printf "%s  %s\n", $$pTempl{id}, $$pTempl{oligo}{id};
      }
      if ((getlogin()||getpwuid($<)) =~ m/^(nadine|sandrar|sfoerste)$/) {
        print "\nSEQUENCING sorted by primer ID\n";
        foreach $pTempl (sort {
          $a->{oligo}{id} cmp $b->{oligo}{id} or
                 $a->{id} cmp $b->{id}
        } @OligoTpl) {
          printf "%s  %s\n", $$pTempl{id}, $$pTempl{oligo}{id};
        }
      }
    }
  } else {
    print "*** NONE ***\n";
  }
}


# report genes tagged genes in a GAP4 database
#
# INTERFACE
# - argument 1:  GAP4 database: (folder/)database.version
# - argument 2*: contig identifiers
#
# - global options:
#   -debug       [STD]
#   -timer       [STD]
#
# DESCRIPTION
# - this function is very similar to SeqHandle.pl -ListCds
#
sub ProgListGene {
  my ($PathProj, @contig) = @_;
  my ($debug, $dbg2, $bTimer);
  my ($pCtgStruct, $pCtg, @GeneStruct);
  my ($pTag, $pFeature, $pGene);
  my ($pSeqQueue);

  # function parameters
  $debug = $ProgOpt{-debug} = 1;
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};

  ##############################################################################
  # get information

  # read contig source into data structure
  unless ($pCtgStruct = &ContigStructMeta ([ $PathProj ],
    -annotation => 1,
    -SlcContig  => \@contig,
    -timer      => $bTimer,
    -debug      => $dbg2)) {
    # error message is done in &ContigStruct
    exit 1;
  }

  # loop over contigs
  foreach $pCtg (values %{$$pCtgStruct{contig}}) {
    $debug and printf STDERR "%s. contig %s\n", &MySub, $$pCtg{id};

    # get consensus sequence
    $pSeqQueue = SeqLab::SeqStreamIn->new($PathProj);
    $pSeqQueue->AddSwitch (
      -SlcID => "\\.$$pCtg{idnum}\$", #"
      -debug => $dbg2);
    unless ($$pCtg{consensus} = $pSeqQueue->GetNext()->{sequence}) {
      print STDERR "ERROR in getting consensus sequence from contig %s/%s\n", $$pCtg{id}, $$pCtg{idnum};
      exit 1;
    }

    # debug gene tags
    for $pTag (sort{ $a->{cpos}<=>$b->{cpos} }
      grep { $_->{type} eq 'EXON' or $_->{type} eq 'ENZ8' }
      grep { not (exists $_->{read} and $_->{read}) } @{$$pCtg{annotation}}
    ) {
      $debug and printf STDERR "%s. annotation: pos %d, label %s\n", &MySub, $$pTag{cpos}, $$pTag{type};
    }

    # enter genes
    for $pTag (sort{ $a->{cpos}<=>$b->{cpos} }
        grep { $_->{type} eq 'EXON' }
        grep { not (exists $_->{read} and $_->{read}) }
        @{$$pCtg{annotation}}) {

      # enter new gene
      if (! ($pGene = (grep{ ($_->{pos}+$_->{length})>=$$pTag{cpos} } @GeneStruct)[0])) {
        $debug and printf STDERR "%s. new gene, pos %d\n", &MySub, $$pTag{cpos};
        $pGene = {
          pos        => $$pTag{cpos},
          orient     => $$pTag{corient},
          length     => $$pTag{length},
          contigcons => $$pCtg{consensus},
          };
        push @GeneStruct, $pGene;

      # extend gene that already exists in the growing list
      } else {
        $debug and printf STDERR "%s. overlapping EXON tag at pos %d\n%s\n", &MySub, $$pTag{cpos}, $$pTag{text};
        if ($$pTag{corient} != $$pGene{orient}) {
          $debug and print  STDERR "  overlapping EXON tag is counter-directed\n";
          next;
        }
        $$pGene{length} = &Max ($$pGene{length}, $$pTag{length}+$$pTag{cpos}-$$pGene{pos});
      }

      # read gene information from EXON tag
      while ($$pTag{text} =~ m/^(.+)=(.*)$/mg) {
        $$pGene{annotation}{$1} .= ($$pGene{annotation}{$1} ? ', ':'') . $2;
      }
      while ($$pTag{text} =~ m/^([^=]+)$/mg) {
        $$pGene{annotation}{plain} .= ($$pGene{annotation}{plain} ? "\n":'') . $2;
      }
    }

    # enter gene features
    for $pTag (sort{ $a->{cpos}<=>$b->{cpos} }
      grep{ $_->{type} eq 'ENZ8' }
      grep{ not (exists $_->{read} and $_->{read}) } @{$$pCtg{annotation}}
    ) {
      if (($pGene = (grep { $_->{pos} <= $$pTag{cpos} and ($_->{pos}+$_->{length}) > $$pTag{cpos} } @GeneStruct)[0])) {
        $debug and printf STDERR "%s. entering feature into gene pos %d\n", &MySub, $$pGene{pos};
        $$pTag{text} =~ s/\n.*$//g;
        $$pTag{text} =~ s/(SA|splice acceptor)/SpliceAccept/;
        $$pTag{text} =~ s/(SD|splice donor)/SpliceDonor/;
        if ($$pTag{text} =~ m/(SpliceAccept|SpliceDonor|start|stop)/) {
          $pFeature = {
            pos  => $$pTag{cpos},
            type => $1,
            };
          push @{$$pGene{feature}}, $pFeature;
        } else {
          printf STDERR "%s. ERROR: unknown feature: %s\n", &MySub, $$pTag{text};
          next;
        }

      # extend gene that already exists in the growing list
      } else {
        printf STDERR "%s. WARNING: homeless feature at pos %d\n", &MySub, $$pTag{cpos};
        next;
      }
    }

    # final output
    &DataPrint (\@GeneStruct);
  } # foreach $pCtg
}


# turn GlimmerM gene models into gene tagging in a GAP4 database
# *** outdated ***, this may be turned into tagging of geneid gene models
#
# INTERFACE
# - argument 1:  GAP4 database: (folder/)database.version
# - argument 2*: contig identifiers
#
# - global options:
#   -debug       [STD]
#   -MaskTag     mask these tags
#
sub ProgAnnotGlimmerM {
  my (%path, @contig);
     ($path{proj}, @contig) = @_;
  my (%gap, $debug, $dbg2, $bTimer, %MaskMe);
  my ($sCall, $pCall, $pSeqQueueIn, $pSeqQueueOut, $pSeq);
  my ($pAnnot, $MaskLen, $MaskSeq, @id, $pGff);
  my ($sTag);

  ##############################################################################
  # pre-work

  # function parameters
  %gap = %{ &GapNameFields ($path{proj}) };
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  %MaskMe = map { ($_ => 1); } @{$ProgOpt{-MaskTag}};
  $debug and printf STDERR "%s. masking tags: %s\n", &MySub, join (', ', keys %MaskMe);

  # get consensus sequences
  # NOTE: we don't purify sequences, cause we need the original sequence positions
  #   according to the (possibly) gapped consensus sequence in the GAP4 DB.
  $path{cons} = $ProgParam{TmpManag}->Create(-touch=>1);
  $sCall = join (' ', $CorePath{call}{GapExportCons}, '-f Experiment',
    $path{proj}, $path{cons},
    map { s/^#.+$/'$&'/; $_; } @contig);
  if (&GapSafeCall ($sCall)) {
    printf STDERR "%s. ERROR in subprocess: %s\n", &MySub, $sCall;
    exit 1;
  }

  # mask tags in consensus sequences
  # sequence 'TAAGT' combines stop in shifting frame, splice donor and acceptor
  if (%MaskMe) {
    # debug masking
    $debug and system "nedit $path{cons} &";
    $pSeqQueueIn = SeqLab::SeqStreamIn->new($path{cons});
    $pSeqQueueIn->AddSwitch (
      -debug    => $debug ? $debug-1 : undef,
      );
    $path{consold} = $path{cons};
    $pSeqQueueOut = SeqLab::SeqStreamOut->new(
      -file     => $path{cons} = $ProgParam{TmpManag}->Create(-touch=>1),
      -debug    => $debug ? $debug-1 : undef,
      );
    while ($pSeq = $pSeqQueueIn->GetNext()) {
      $debug and printf STDERR "%s. %s contig tags in sequence %s\n", &MySub, int @{$$pSeq{tagc}}, $$pSeq{id};
      foreach $pAnnot (grep { $MaskMe{$_->{type}} and ($_->{end}-$_->{offset}+1) >= 50 }
                     @{$$pSeq{tagc}}) {
        $debug and printf STDERR "%s. found tag, range %d..%d, label %s\n", &MySub, $$pAnnot{offset}, $$pAnnot{end}, $$pAnnot{type};
        $MaskLen = $$pAnnot{end} - $$pAnnot{offset} + 1;
        $MaskSeq = 'TAAGT' x ($MaskLen / 5 + 1);
        substr ($$pSeq{sequence}, $$pAnnot{offset}-1, $MaskLen) =
          substr ($MaskSeq, 0, $MaskLen);
      }
      $pSeqQueueOut->Push ($pSeq);
    }
    # debug masking
    $debug and system "nedit $path{cons} &";
  }

  # get list of translated contig identifiers
  $sCall = join (' ', $CorePath{call}{SeqID}, $path{cons}, '|');
  unless (@id = map { s/$gap{name}\.$gap{version}\.(\d+)/'#$1'/; $_; } @{ &PlainToTable ($sCall, -TabType=>'A1') }) {
    printf STDERR "%s. ERROR in subprocess: %s\n", &MySub, $sCall;
    exit 1;
  }
  $sCall = join (' ', $CorePath{call}{GapContigIds}, $path{proj},
    $path{idtransl} = $ProgParam{TmpManag}->Create(-touch=>1),
    @id);
  &GapSafeCall ($sCall);
  unless (@id = @{ &PlainToTable ($path{idtransl}, -TabType=>'A1') }) {
    printf STDERR "%s. ERROR in subprocess: %s\n", &MySub, $sCall;
    exit 1;
  }

  ##############################################################################
  # perform GlimmerM analysis
  # translate GlimmerM => GFF => Experiment tags

  $path{tag} = $ProgParam{TmpManag}->Create(-touch=>1);

  # loop over consensus sequences of contigs
  $pSeqQueueIn = SeqLab::SeqStreamIn->new($path{cons});
  $pSeqQueueIn->AddSwitch (
    -debug    => $debug ? $debug-1 : undef,
    );
  while ($pSeq = $pSeqQueueIn->GetNext()) {
    $debug and printf STDERR "%s. contig %s / %s\n", &MySub, $$pSeq{id}, $id[0];

    # assign correct contig identifier
    $$pSeq{id} = shift @id;

    # save sequence
    unless (&WriteFile ($path{seq}=$ProgParam{TmpManag}->Create(-touch=>1),
      &SeqentryToFasta($pSeq))) {
      printf STDERR "%s. ERROR when saving sequence entry %s\n", &MySub, $$pSeq{id}||"''";
      next;
    }

    # prepare GlimmerM prediction in Experiment format
    $path{pwd} = &PathCwd();
    chdir $CorePath{glimmerM}{home};
    $pCall = &CallExtClosure ($CorePath{glimmerM}{call}, $path{seq},
      $CorePath{glimmerM}{delta},
      '|', $CorePath{call}{GlimmerToGff}, "-i $$pSeq{id} -m", '-',
      '|', $CorePath{call}{GffToExper}, '-',
      ">> $path{tag}",
      );
    unless (&$pCall) {
      printf STDERR "%s. ERROR in subprocess: %s, ret code: %d\n", &MySub,
        &$pCall ('WhatRUCalling'), &$pCall ('WhatWasRet');
      exit 1;
    }
    chdir $path{pwd};

    # debug output
    $debug and system "nedit $path{tag}";
  }

  # tidy up
  $debug or unlink (grep{-e $_}
    $path{cons}, $path{consold}, $path{idtransl}, $path{tag}, $path{seq});
}


# remove redundant tags in Experiment sequences
#
# INTERFACE
# - argument 1*: Experiment files, default: files listed in ./fofn
#
# - global options:
#   -debug       [STD]
#
# DESCRIPTION
# - The tagged sequences will be written back to the file path of the
#   sequence source file only if changes were made.
# - This code not only changes/deletes tags, it also changes/deletes template
#   entries
#
sub ProgAnnotReduce {
  my (@ExperArg) = @_;
  my ($bMe, $debug, $dbg2);
  my ($pSeqQueueIn, $pSeq, $pSeq2, @SeqStore, $pSeqQueueOut);
  my ($pTag, $pTagPrev, $pTagSlc, %TagMislead, $pGrp, $SeqSlice);
  my ($CtTag, $CtTagPrev, %ProcLog, $bMsgIncl);

  ##############################################################################
  # pre-work

  # function parameters
  $bMe = ((getlogin()||getpwuid($<)) eq 'szafrans') ? 1 : 0;
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  print  "$ProgFile -$ProgMode: remove redundant tags\n";
  if (!@ExperArg and -r './fofn') {
    @ExperArg = @{ &LoadFoid ('./fofn') };
    printf "  %d IDs found in ./fofn\n", int @ExperArg;
    if (-r './ContigTags') {
      push @ExperArg, './ContigTags';
    }
  }

  # initialize sequence queues
  $pSeqQueueOut = SeqLab::SeqStreamOut->new(
     -file   => 'rewrite',
      $dbg2 ?
    (-debug  => $dbg2) : (),
    );
  $pSeqQueueIn = SeqLab::SeqStreamIn->new(@ExperArg);
  $pSeqQueueIn->AddSwitch (
    -debug => $debug ? $debug-1 : undef,
    );

  ##############################################################################
  # loop over sequence files and sequences

  # loop over input sequences
  while ($pSeq = $pSeqQueueIn->GetNext()) {
    if ($$pSeq{SrcFmt} ne 'Experiment') {
      printf STDERR "ERROR: sequence %s is not in Experiment file format, but is %s\n",
        $$pSeq{id}||"''", $$pSeq{SrcFmt}||"''";
      next;
    }
    $ProcLog{CtSeq} ++;

    # changes in most recent source file? => rewrite buffered sequences
    # code is nearly the same as below (outside while loop)
    if (@SeqStore and $$pSeq{SrcPath} ne $SeqStore[0]{SrcPath}) {
      if ($ProcLog{CurrTagChg} or $ProcLog{CurrTagDel} or $ProcLog{CurrTemplChg}) {

        # update global counters
        $ProcLog{GlobTagDel} += $ProcLog{CurrTagDel};
        $ProcLog{GlobTagChg} += $ProcLog{CurrTagChg};
        $ProcLog{GlobTemplChg} += $ProcLog{CurrTemplChg};
        $ProcLog{CurrTemplChg} = $ProcLog{CurrTagChg} = $ProcLog{CurrTagDel} = 0;

        # re-write sequence to file
        unless ($pSeqQueueOut->Push (@SeqStore)) {
          printf STDERR "%s. ERROR: unable to rewrite one of these sequences: %s\n", &MySub,
            join (map { $_->{id} } $SeqStore[0 .. &Min ($#SeqStore, 5)]);
        }
      }

      # empty sequence buffer
      @SeqStore = ();
    }

  ##############################################################################
  # delete template 'NONE'

    # find basic template for version-numbered sequence IDs
    if ($$pSeq{TN} =~ m/\.\d+$/ and (! $$pSeq{LN} or $$pSeq{LN} eq 'unknown')) {
      $$pSeq{TN} = $`;
      $ProcLog{CurrTemplChg} ++;
    }

    # delete template 'NONE'
    if ($$pSeq{TN} eq 'NONE') {
      if ((! $$pSeq{LN} or $$pSeq{LN} eq 'unknown') and $$pSeq{id} =~ m/\.\d+$/) {
        $$pSeq{TN} = $`;
      } else {
        delete $$pSeq{TN};
      }
      $ProcLog{CurrTemplChg} ++;
    }

  ##############################################################################
  # loop over annotations in sequence data structure

    # tags are assumed to be in order of offset
    AnnotReduceTag:
    for ($CtTag=0; $CtTag<@{$$pSeq{annot}}; $CtTag++) {
      $pTag = $$pSeq{annot}[$CtTag];
      $ProcLog{CtTagAll} ++;

  ##############################################################################
  # delete superficious tags

      # delete all these tags
      if (($$pTag{type} eq 'SPSQ' and $$pTag{text} =~ m/^(Chr|mtDNA)/i) or
           $$pTag{text} =~ m/^UNK-2(_|$)/i or
           $$pTag{text} =~ m/^UNK-15/i or
           $$pTag{text} =~ m/^UNK-16/i or
           0) {
        splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
        $CtTag --;
        next AnnotReduceTag;  # turn to next annotation in sequence
      }

      # warning on these tags
      if (0) {
        printf STDERR "%s. found suspicious tag, label %s, text:\n%s\n", &MySub, $$pTag{type}, $$pTag{text};
      }

  ##############################################################################
  # delete doublette tags

      # delete tags redundant in respect of identical range
      # loop over tags
      if ($CtTag>0 and $$pSeq{annot}[$CtTag-1]{offset} == $$pTag{offset}) {
        AnnotReduceDoubl: {
          $CtTagPrev = 0;
          foreach $pTagPrev (grep { $_ ne $pTag and
              $_->{layer}  eq $$pTag{layer} and
              $_->{offset} == $$pTag{offset} and
              $_->{end}    == $$pTag{end} and
             ($_->{type}   eq $$pTag{type} or ($_->{type}.' '.$$pTag{type}) =~ m/\b(REPT|SPSQ)\b/)
              } map {
                $_->{num} = $CtTagPrev++;
                $_->{text} =~ m/^[\w.-]+/ and $_->{Label2nd} = $&;
                $_->{text} =~ m/^rel(\. ?|_)identity[ =]([01]\.[0-9]{2,3})$/m and $_->{RelID} = $2;
                $_;
              } @{$$pSeq{annot}}[0 .. $CtTag]) {

            # mis-oriented doublette
            if ($$pTagPrev{orient} != $$pTag{orient} and
               ($$pTagPrev{type}.' '.$$pTag{type}) !~ m/\b(REPT|SPSQ)\b/) {
              $debug||$bMe and printf STDERR "%s. mis-oriented doublette, seq %s, num %d/%d, range %d..%d %s/%s, label %s & %s, layer %s\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTagPrev{orient}, $$pTag{orient}, $$pTag{type}, $$pTag{Label2nd}, $$pTag{layer}||'**NONE**';
              next;
            }

            # exact doublette
            if ($$pTag{text} eq $$pTagPrev{text}) {
              $debug and printf STDERR "%s. definitely found doublette (exact), seq %s, num %d/%d, range %d..%d %s/%s, label %s, layer %s\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTagPrev{orient}, $$pTag{orient}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              next AnnotReduceTag;  # turn to next annotation in sequence
            }

            # delete doublette of label 'SPSQ' versus any
            if ($$pTagPrev{type} eq 'SPSQ') {
              $debug and printf STDERR "%s. definitely found doublette (SPSQ), seq %s, num %d(/%d), range %d..%d, label %s\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTagPrev{offset}, $$pTagPrev{end}, $$pTagPrev{type};
              splice @{$$pSeq{annot}}, $$pTagPrev{num}, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              redo AnnotReduceDoubl;  # check again for redundancies against current annotation
            }
            if ($$pTag{type} eq 'SPSQ') {
              $debug and printf STDERR "%s. definitely found doublette (SPSQ), seq %s, num (%d/)%d, range %d..%d, label %s\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTag{type};
              splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              next AnnotReduceTag;  # turn to next annotation in sequence
            }

            if ($$pTag{type} eq 'REPT') {

              # doublette where both have label 'REPT', both have rel.identity field
              # drop the one showing lower score ('rel.identity')
              if ($$pTagPrev{RelID} and $$pTag{RelID}) {
                if ($$pTagPrev{RelID} <= $$pTag{RelID}) {
                  $debug and printf STDERR "%s. definitely found doublette (REPT, lower RelID), seq %s, num %d(/%d), range %d..%d, label %s\n", &MySub,
                    $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTagPrev{offset}, $$pTagPrev{end}, $$pTagPrev{type};
                  splice @{$$pSeq{annot}}, $$pTagPrev{num}, 1; $ProcLog{CurrTagDel} ++;
                  $CtTag --;
                  redo AnnotReduceDoubl;  # check again for redundancies against current annotation
                } else {
                  $debug and printf STDERR "%s. definitely found doublette (REPT, lower RelID), seq %s, num (%d/)%d, range %d..%d, label %s\n", &MySub,
                    $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTag{type};
                  splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
                  $CtTag --;
                  next AnnotReduceTag;  # turn to next annotation in sequence
                }
              }

              # doublette where both have label 'REPT', same ID ('2nd label')
              # drop any of both
              if ($$pTagPrev{Label2nd} and $$pTagPrev{Label2nd} eq $$pTag{Label2nd}) {
                $debug and printf STDERR "%s. definitely found doublette (REPT, same 2nd label, no score), seq %s, num (%d/)%d, range %d..%d, label %s & %s\n", &MySub,
                  $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{Label2nd};
                splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
                $CtTag --;
                next AnnotReduceTag;  # turn to next annotation in sequence
              }
            }

            # report remaining non-REPT doublettes
            elsif ($$pTag{type} !~ m/^(MASK|REPT|SPSQ)$/ and $debug||$bMe) {
              printf STDERR "%s. possibly found doublette tag, seq %s, num %d/%d, range %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
            }
          }  # end foreach $pTagPrev
        }  # end AnnotReduceDoubl
      }

  ##############################################################################
  # join/delete overlapping tags

      # always join these tag types
      # these tags are orientation-less
      if (($$pTag{type} =~ m/^(CVEC|ENZ2)$/ or ($$pTag{type} eq 'EXON' and $$pTag{layer} eq 'cons')) and
          $pTagPrev = (grep {
            $_->{layer} eq $$pTag{layer} and
            $_->{type}  eq $$pTag{type} and
            $_->{end}   >= $$pTag{offset}
          } @{$$pSeq{annot}}[0 .. ($CtTag-1)] )[0] ) {
        $debug and printf STDERR "%s. joining overlapping %s tags - seq %s, range %d..%d / %d..%d\n", &MySub, $$pTag{type}, $$pSeq{id}, $$pTag{offset}, $$pTag{end}, $$pTagPrev{offset}, $$pTagPrev{end};
        if ($$pTag{text} and $$pTagPrev{text}) {
          printf STDERR "CONFLICT: overlapping %s tags both have text fields - seq %s, range %d..%d / %d..%d\n", $$pTag{type}, $$pSeq{id}, $$pTag{offset}, $$pTag{end}, $$pTagPrev{offset}, $$pTagPrev{end};
        } elsif ($$pTag{orient} != $$pTagPrev{orient}) {
          printf STDERR "CONFLICT: overlapping %s tags have conflicting strandedness - seq %s, range %d..%d / %d..%d\n", $$pTag{type}, $$pSeq{id}, $$pTag{offset}, $$pTag{end}, $$pTagPrev{offset}, $$pTagPrev{end};
        } else {
          $$pTagPrev{text} ||= $$pTag{text};
          $$pTagPrev{end} = &Max ($$pTag{end}, $$pTagPrev{end});
          splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
          $CtTag --;
          next AnnotReduceTag;  # turn to next annotation in sequence
        }
      }

      # delete REPT/SPSQ annotations if showing included/including range
      # REMEMBER: tags are expected to be in order of their offset positions
      # loop over previous tags
      if ((getlogin()||getpwuid($<)) eq 'szafrans' and $$pTag{type}=~m/^(MASK|REPT|SPSQ)$/) {
        %TagMislead = (
          'thug-S' => { microsatellite_TAA=>'thug-S' },
          'TRE5-B' => { microsatellite_CAA=>'TRE5-B' },
          microsatellite_CAA => { 'thug-S'=>'thug-S' },
          microsatellite_TAA => { 'TRE5-B'=>'TRE5-B' },
          );
        AnnotReduceOlap: {
          $CtTagPrev = 0;
          foreach $pTagPrev (grep { $_ ne $pTag and
              $_->{layer} eq $$pTag{layer} and
              $_->{end}   >= $$pTag{end} and
              $_->{type}  =~ m/^(MASK|REPT|SPSQ)$/
              } map {
                $_->{num} = $CtTagPrev ++;
                $_->{text} =~ m/^[\w.-]+/ and $_->{Label2nd} = $&;
                $_->{text} =~ m/^rel(\. ?|_)identity[ =]([01]\.[0-9]{2,3})$/m and $_->{RelID} = $2;
                $_->{length} = $_->{end} - $_->{offset} + 1;
                $_;
              } @{$$pSeq{annot}}[0 .. $CtTag]) {

            # misleading subtypes 
            # - e.g. thug-S, TRE5-B etc.
            if ($TagMislead{$$pTag{Label2nd}}{$$pTagPrev{Label2nd}} and
                &Max (map { $_->{length} } $pTag, $pTagPrev) / &Min (map { $_->{length} } $pTag, $pTagPrev) < 1.2 and
                $pTagSlc = (grep { $_->{Label2nd} and $_->{Label2nd} eq $TagMislead{$$pTag{Label2nd}}{$$pTagPrev{Label2nd}} } $pTag, $pTagPrev)[0]) {
              $debug||$bMe and printf STDERR "%s. misleading repeat subtype %s, seq %s, num %d/%d(%d), range %d..%d over %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pTagSlc{Label2nd}, $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTagSlc{num},
                $$pTagPrev{offset}, $$pTagPrev{end}, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              $debug||$bMe and print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
              $ProcLog{CurrTagChg} += ($$pTagSlc{text} =~
                s/$$pTagSlc{Label2nd}/(grep { $_->{Label2nd} ne $$pTagSlc{Label2nd} } $$pTag{Label2nd}, $$pTagPrev{Label2nd})[0]/e);
            }

            # same subtype, inner one lower/equal scoring than the outer one
            # => remove the inner (current) one
            if ($$pTag{Label2nd} and $$pTag{Label2nd} eq $$pTagPrev{Label2nd} and
                $$pTag{RelID} and $$pTag{RelID} <= $$pTagPrev{RelID}) {
              $debug and printf STDERR "%s. redundant overlapping tag (weak inner), seq %s, num %d(%d), range %d..%d over %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag,
                $$pTagPrev{offset}, $$pTagPrev{end}, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              $debug and print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
              splice @{$$pSeq{annot}}, $CtTag, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              next AnnotReduceTag;  # turn to next annotation in sequence
            }

            # same subtype, highly similar range, inner one higher scoring than the outer one
            # => remove the weak outer (previous) one
            if ($$pTag{Label2nd} and $$pTag{Label2nd} eq $$pTagPrev{Label2nd} and
                abs ($$pTag{length} - $$pTagPrev{length}) <= 5 and
                $$pTagPrev{RelID} and $$pTagPrev{RelID} < $$pTag{RelID}) {
              $debug and printf STDERR "%s. redundant overlapping tag (sim., weak outer), seq %s, num (%d)%d, range %d..%d over %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag,
                $$pTagPrev{offset}, $$pTagPrev{end}, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              $debug and print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
              splice @{$$pSeq{annot}}, $$pTagPrev{num}, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              redo AnnotReduceOlap;  # check again for redundancies against current annotation
            }

            # highly similar range, one very weak
            # => remove the very weak one
            if (abs ($$pTag{length} - $$pTagPrev{length}) < 15 and
                $pTagSlc = (grep { $_->{RelID} and $_->{RelID} < 0.890 } $pTag, $pTagPrev)[0]) {
              $debug||$bMe and printf STDERR "%s. redundant overlapping tag (sim., very weak), seq %s, num %d/%d/(%d), range %d..%d over %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag, $$pTagSlc{num},
                $$pTagPrev{offset}, $$pTagPrev{end}, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              $debug||$bMe and print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
              splice @{$$pSeq{annot}}, $$pTagSlc{num}, 1; $ProcLog{CurrTagDel} ++;
              $CtTag --;
              if ($pTagSlc eq $pTag) {
                next AnnotReduceTag;  # turn to next annotation in sequence
              } else {
                redo AnnotReduceOlap;  # check again for redundancies against current annotation
              }
            }

            # report remaining cases
            if ($bMsgIncl < 3) {
              printf STDERR "%s. possibly found included tag, seq %s, num %d/%d, range %d..%d over %d..%d, label %s, layer %s, texts:\n", &MySub,
                $$pSeq{id}, $$pTagPrev{num}, $CtTag,
                $$pTagPrev{offset}, $$pTagPrev{end}, $$pTag{offset}, $$pTag{end}, $$pTag{type}, $$pTag{layer}||'**NONE**';
              print  STDERR map { "  $_\n" } split (/\n/, $$pTagPrev{text}), '-' x 30, split (/\n/, $$pTag{text});
              printf STDERR "  2nd label: %s / %s\n", $$pTagPrev{Label2nd}, $$pTag{Label2nd};
              printf STDERR "  rel.identities: %s / %s\n", $$pTagPrev{RelID}, $$pTag{RelID};

              # unable to resolve
              if ($$pTag{Label2nd} ne $$pTagPrev{Label2nd}) {
                next;
              } else {
                $bMsgIncl ++;
                last;
              }
            }
          }  # end foreach $pTagPrev
        }  # end AnnotReduceOlap
      }

  ##############################################################################
  # change tag text syntax

      # delete empty text
      $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^\?$//);

      # global text syntax
      $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/\bchimaeric\b/chimerical/);

      # oligo ID syntax
      if ($$pTag{type} eq 'OLIG') {
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^(id|serial#)(=JP)(\d{3})$/serial#${2}0$3/m);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/\nflags=(\n|$)/$1/s);
      }

      # repeat tags:
      if ($$pTag{type} =~ m/^(REPT|SPSQ)$/) {

        # repeat ID syntax
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a21d03.*\nrange.+/thug-M/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a32h11.*\nrange.+/thug-L/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a88h05.*\nrange.+/thug-L/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a91a10.*\nrange.+/thug-M/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a123h11.*\nrange.+/DDE-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a131e11.*\nrange.+/DDE-M/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a154g09.*\nrange.+/DDE-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a166h05.*\nrange.+/DDE-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^a239e06.*\nrange.+/thug-M/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^DDE/DDT/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^DDT-L/DDT-S/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^DRE/TRE5-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^H3R.*\nrange.+/DGLT-A1/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^RED/TRE3-B/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^Tdd-3/TRE3-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^Tdd-5(\d)/UNK-1$1/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^thug-L.*/thug-S/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^thug-M.*/thug-T/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^TRE5-A_Div302/TRE5-A_ModC2/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-1\b/Tdd-5/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-3.*/DDT-S/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-4.*/DDT-B/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-6.*/DDT-B/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-8.*/TRE5-B/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-9.*/TRE3-C/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-10.*/DDE-A/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-11.*/DDE-M/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-13.*/DGLT-P/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-14.*/TRE3-D/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-17.*/thug-S/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^UNK-18.*/thug-T/);
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^Uno/DDT/);

        # undo annotation attribute syntax bug
        #$ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^(.+\n)+(.+range.+\nrel\. identity.+)$/$2/s);

        # delete empty description attribute
        $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/(^|\n)(description=)($|\n)/$1/is);
      }

  ##############################################################################
  # check orientation syntax

      # always direction-less
      if ($$pTag{type} =~ m/^(CVEC|ENZ0|ENZ2|NNNN)$/ and $$pTag{orient} != 0) {
        $$pTag{orient} = 0;
        $ProcLog{CurrTagChg} ++;
      }

      # was mis-changed once
      if ($$pTag{type} =~ m/^(EXON|XXX)$/ and $$pTag{layer} ne 'cons' and
          $$pTag{end} == length ($$pSeq{sequence}) and
          $$pTag{orient} != 1) {
        $debug||$bMe and printf STDERR "%s. forcing tag orientation to seq orientation, seq %s, tag %s\n", &MySub, $$pSeq{id}, $$pTag{type};
        $$pTag{orient} = 1;
        $ProcLog{CurrTagChg} ++;
      }

  ##############################################################################
  # change gene annotation syntax

      # introns, splice sites
      if ($$pTag{type} eq 'ENZ8' and $$pTag{layer} eq 'cons') {

        # create splice site label according to tagged sequence
        if (! $$pTag{text} and $$pTag{end}-$$pTag{offset} == 1) {
          $pSeq2 = $$pSeq{sequence} ? $pSeq :
            &{$SeqFFmtGet{Experiment}{FuncEntry}} (&PathSplit($$pSeq{SrcPath})->{dir}.'/ContigConsensi', -MatchID=>[$$pSeq{id}]);
          $$pSeq{sequence} or printf STDERR "%s. loaded seq string from file %s\n", &MySub, &PathSplit($$pSeq{SrcPath})->{dir}.'/ContigConsensi';
          $SeqSlice = substr ($$pSeq2{sequence}, $$pTag{offset}-1, 2);
          if ($$pTag{orient} < 0 ) { $SeqSlice = &SeqStrRevcompl ($SeqSlice); }
          if ($SeqSlice eq 'GT') {
            $$pTag{text} = 'SpliceDonor';
            printf STDERR "%s. completing splice donor annotation - seq %s, range %d..%d\n", &MySub, $$pSeq{id}, $$pTag{offset}, $$pTag{end};
            $ProcLog{CurrTagChg} ++;
          } elsif ($SeqSlice eq 'AG') {
            $$pTag{text} = 'SpliceAccept';
            printf STDERR "%s. completing splice acceptor annotation - seq %s, range %d..%d\n", &MySub, $$pSeq{id}, $$pTag{offset}, $$pTag{end};
            $ProcLog{CurrTagChg} ++;
          } else {
            printf STDERR "%s. WARNING: strange %s tag - seq %s, range %d..%d, seq string %s\n", &MySub, $$pTag{type}, $$pSeq{id}, $$pTag{offset}, $$pTag{end}, $SeqSlice||"MISSING";
          }
        }

        # split intron tag to splice sites
        if ($$pTag{text} =~ m/^intron\b/ and ($$pTag{end}-$$pTag{offset}) > 4) {
          $debug||$bMe and printf STDERR "%s. splitting intron tag to splice sites: %d..%d\n", &MySub,
            $$pTag{offset}, $$pTag{end};
          push @{$$pSeq{annot}}, { %$pTag };
          $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^intron\b/SpliceDonor/s);
          $$pSeq{annot}[-1]{text} =~ s/^intron\b/SpliceAccept/s;
          if ($$pTag{orient} > 0) {
            $$pSeq{annot}[-1]{offset} = $$pTag{end}-1;
            $$pSeq{annot}[-1]{end} = $$pTag{end};
            $$pTag{end} = $$pTag{offset}+1;
          } else {
            $$pTag{offset} = $$pSeq{annot}[-1]{end}-1;
            $$pTag{end} = $$pSeq{annot}[-1]{end};
            $$pSeq{annot}[-1]{end} = $$pSeq{annot}[-1]{offset}+1;
          }
        }

        # orient transcript signal parallel to transcript
        $$pTag{text} =~ m/^[\w.-]+/ and $$pTag{Label2nd} = $&;
        $pGrp = [ grep {
          $_->{type} eq 'EXON' and $_->{layer} eq 'cons' and
          $_->{offset} <= $$pTag{offset} and $_->{end} >= $$pTag{end}
          } @{$$pSeq{annot}} ];
        if (@$pGrp == 1 and $$pGrp[0]{orient} and $$pGrp[0]{orient} != $$pTag{orient}) {
          $debug||$bMe and printf STDERR "%s. changing transcript signal orientation, seq %s, transcript %s, range %d..%d\n", &MySub, $$pSeq{id},
            ($$pGrp[0]{text} =~ m/^(gene|id)=([^?]+)$/m) ? $2 : '', $$pGrp[0]{offset}, $$pGrp[0]{end};
          $debug||$bMe and printf STDERR "  signal range %d..%d %s, label %s\n", $$pTag{offset}, $$pTag{end}, $$pTag{orient}, $$pTag{type};
          $$pTag{orient} = $$pGrp[0]{orient};
          $ProcLog{CurrTagChg} ++;
        }

        # others
        else {
          $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^splice acceptor\b/SpliceAccept/i);
          $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^SA\b/SpliceAccept/);
          $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^splice donor\b/SpliceDonor/i);
          $ProcLog{CurrTagChg} += ($$pTag{text} =~ s/^SD\b/SpliceDonor/);
        }
      }

    } # for ... $pTag

    # store sequences originating from same file
    push @SeqStore, $pSeq;
    $debug and printf STDERR "%s. sequence %s pushed to buffer\n", &MySub, $$pSeq{id};

    undef $pSeq2;
  } # while $pSeq

  ##############################################################################
  # post-work: print summary

  # changes in most recent source file? => rewrite buffered sequences
  # code is nealy the same as above (early inside previous while loop)
  if ($ProcLog{CurrTagChg} or $ProcLog{CurrTagDel} or $ProcLog{CurrTemplChg}) {

    # update global counters
    $ProcLog{GlobTagDel} += $ProcLog{CurrTagDel};
    $ProcLog{GlobTagChg} += $ProcLog{CurrTagChg};
    $ProcLog{GlobTemplChg} += $ProcLog{CurrTemplChg};

    # re-write sequence to file
    unless ($pSeqQueueOut->Push (@SeqStore)) {
      printf STDERR "%s. ERROR: unable to rewrite one of these sequences: %s\n", &MySub,
        join (map { $_->{id} } $SeqStore[0 .. &Min ($#SeqStore, 5)]);
    }
  }

  # report changes
  printf "  sequences inspected: %d\n", $ProcLog{CtSeq};
  printf "  templates changed: %d\n", $ProcLog{GlobTemplChg};
  printf "  tags: %d\n", $ProcLog{CtTagAll};
  printf "  tags deleted: %d\n", $ProcLog{GlobTagDel};
  printf "  tag syntax changes: %d\n", $ProcLog{GlobTagChg};
}
# $Id: GscjGap.pl,v 1.38 2018/06/05 18:02:56 szafrans Exp $
