#! /usr/local/bin/perl
################################################################################
#
#  Sequence Laboratory
#  Do All Kinds of Sequence Handling and Manipulation
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 1998-2004,
#    szafrans@imb-jena.de
#  Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2004,
#    karol@pcbi.upenn.edu
#  Karol Szafranski on behalf of FLI Jena, Genome Analysis Group, 2005-2006,
#    szafrans@fli-leibniz.de
#
################################################################################
#
#  DESCRIPTION
#
# - See function &usage for description of command line syntax
#
# - this program does not intend to cover alignment functionalities.
#   See Align.pl for that.
#
# - dependencies, interfaces:
#   - for supported sequence input file formats see
#     - %SeqLab::SeqFormat::SeqFFmtGet
#   - for supported sequence output file formats see
#     - %SeqLab::SeqFormat::SeqFFmtOutFunc
#
# - each function comes along with a description at the beginning of the code
#   block
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   %GlobStore
#   $ProgFile,$ProgFstump
#   %ProgParam
#   $ProgMode,%ProgOpt,@ProgArg
#
# - usage help, command line arguments
#   &usage
#   &AddSwitch
#
# - basic I/O
#   &SeqQueue
#   &PrepOstump
#   &OutTabIntro
#   &OutTabHeadln
#   &OutTabLine
#   &OutTabTail
#
# - sequence transformation
#   &ProgTemp*
#   &ProgIdChg
#   &ProgRevcompl
#   &ProgTransl
#   &ProgRetransl
#   &ProgGffToExper
#   &ProgBisulfite
#
# - sequence fragmentation and concatenation
#   &ProgBreak
#   &ProgCutMotif
#   &ProgListNblock
#   &ProgCutHTGS
#   &ProgUDG
#   &ProgConcat
#
# - sequence annotation topology and feature extraction
#  - annotation only
#   &ProgAnnotRen
#   &ProgAnnotExpand
#   &ProgAnnotRrnaConst
#   &ProgAnnotRrna
#  - annotation to sequence
#   &ProgAnnotCase
#   &ProgAnnotMask
#   &ProgAnnotCatseq
#   &ProgIntgenBactCatseq
#   &ProgIntgenDictyCatseq
#   &ProgIntron1stCatseq
#   &ProgCatCds
#     [outdated? No, implement the -SlcValid feature in &ProgAnnotCatseq!]
#
# - sequence analysis
#   &ProgLowcplx
#   &ProgLowcplxAnnot
#   &ProgLowcplxMask
#   &ProgSiteHappy
#     &SitePcrmarkerRange
#   &ProgOrf
#   &ProgProtPlot
#
# - sequence set curation and processing
#   &ProgStatist
#   &ProgStatLen
#   &ProgStatSmb
#   &ProgRandOrder
#   &ProgRandFragment
#   &ProgRandSegment
#     &ProgRandGauss
#   &ProgUniqueID
#   &ProgBlastDb
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#   -timer      print time-performance protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - some program modes should move to other bundle scripts or stand-alone
#   script
#   -CutMotif to SeqMotif.pl, as -RestrCut
#
# - switch -OutImg=S won't work right in multi-sequence processing
#   instead, treat the argument S like in -OutStump=S. Have a look at functions:
#   &ProgLowcplx
#   &ProgOrf
#
# - look also for notes in the header of each function block
#
################################################################################


# global constants and variables
# also used by modules as a unique global data anchor
our %GlobStore;

# include path(s), includes
BEGIN {
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
}
use strict; #use warnings;  # OK 20061011
use FileHandle;
use GD;  # this is not part of standard Perl distribution
  if ($GD::VERSION < 1.20 and ! $main::GlobStore{GdWarn}) {
    printf STDERR "WARNING: GD v%s doesn't support png\n", $GD::VERSION;
    $main::GlobStore{GdWarn} = 1;
  }
use MainLib::StrRegexp;
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs &QueryConfirm &GetWriteHandle);
use MainLib::File;
use MainLib::FileTmp qw(&PathUnique);
use MainLib::Graphics;
use MainLib::Internet qw(&HtmlCharEncode);
use MainLib::Misc;
use Math::Round qw(&nearest);
use Math::Calc;
use Math::Range;
use Math::Random;
use Math::Statist qw(&DistribEmpir &SampleMean &GaussLimit);
use Math::Plot2D;
use Math::PlotImg;  # this is depending on library GD
use database::DbPlain;
use database::Table qw(&TableConvert);
use SeqLab::SeqBench;
use SeqLab::SeqFormat;
use SeqLab::SeqStreamIn;
use SeqLab::SeqStreamOut;
use SeqLab::SeqAnalysis;
use SeqLab::MotifLib;
use SeqLab::MotifRE;  # outdated, but extensively used


# script ID
# - program name as specified on the command line
our $ProgFile = ( split('/',__FILE__) )[-1];
our $ProgFstump=$ProgFile; $ProgFstump=~s/\.\w{1,4}$//;

# global constants (esp. default values)
our %ProgParam;
$ProgParam{default}{AnnotLbl}{extern} = 'ENZ9';
$ProgParam{default}{AnnotLbl}{motif} = 'OLIG';
$ProgParam{default}{AnnotLbl}{lowcplx} = 'REPT';
$ProgParam{default}{BlastDbType} = 'NCBI 1.4';
$ProgParam{default}{BreakLen} = 1000;
$ProgParam{default}{BreakAssemblyOlap} = 5;
$ProgParam{default}{LowcplxUnit} = '1..6';
$ProgParam{default}{OutImgRelhigh}{lowcplx} = 0.20;
$ProgParam{default}{OutImgRelhigh}{orf} = 0.09;
$ProgParam{default}{OutImgRelhigh}{ProtPlot} = 0.25;
$ProgParam{default}{OutImgRelhigh}{StatLen} = 0.6;
$ProgParam{default}{OutImgWidth} = 640;
$ProgParam{default}{OutSeqSort} = 'id';
$ProgParam{default}{ProgMode} = 'cat';
$ProgParam{default}{SitePcrprodMin} = 300;
$ProgParam{default}{SitePcrprodMax} = 2000;
$ProgParam{default}{SitePrimerSize} = 30;
$ProgParam{default}{SitePrimerGcExtreme} = 0.28;
$ProgParam{default}{SiteHappyEnds} = 25000;
$ProgParam{default}{ThreshOlap} = 100;
$ProgParam{default}{WinSize}{lowcplx} = 20;

$ProgParam{MaskLen}  = 8;
$ProgParam{MaskChar} = undef;  # see SeqLab::SeqQueueIn.pm

# working desk
$ProgParam{store} = undef;


# manage I/O #####################################

# organise I/O handles
$ProgParam{handle}{tab} = \*STDOUT;
&Unbuffer();

# organise tmp files
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $ProgParam{TmpManag} = MainLib::FileTmp->new(); };
&$pcFT;


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
our $ProgMode = undef;
our @ProgArg = ();
our %ProgOpt = ();
unshift @ProgArg, &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};

# eventually open LOG file
if ($ProgOpt{-log}) {
  $ProgOpt{LogFile} = ($ProgOpt{-log}!=1) ? $ProgOpt{-log} : undef;
  &LogOpen (-file=>$ProgOpt{LogFile}, -stamp=>$ProgFstump, -prog=>"$ProgFile -$ProgMode");
}
END {
  $ProgOpt{-log} and &LogClose();
}

# initialise sequence output queue
unless ($ProgMode =~ m/^(h(elp)?$|mass$|Motif(Lib|Seq)$|ProtPlot$|stat)/i) {
  $ProgParam{store}{queue}{SeqOut} = SeqLab::SeqStreamOut->new(
      $ProgOpt{-OutDir} ?
    (-dir => $ProgOpt{-OutDir}) : (),
      $ProgOpt{-OutSeq} ?
    (-file => $ProgOpt{-OutSeq}) : (),
      $ProgOpt{-OutSeqFmt} ?
    (-format => $ProgOpt{-OutSeqFmt}) : (),
      $ProgOpt{-OutIdFmt} ?
    (-IdFormat => $ProgOpt{-OutIdFmt}) : (),
      ($ProgOpt{-OutSeqFmt}||''=~m/pretty/i and $ProgOpt{-var}{PosRef}) ?
    (-PosRef => $ProgOpt{-var}{PosRef}) : (),
      $ProgOpt{-OutSeqSort} ?
    (-sort => $ProgOpt{-OutSeqSort}) : (),
      $ProgOpt{-debug} ?
    (-debug => $ProgOpt{-debug}-1) : (),
    );
}


# work flow manifold #############################

# chain to program mode which is optionally argument-less
if (0) { }
elsif (!@ARGV or $ProgMode=~m/^h(elp)?$/i) { &usage() }
elsif ($ProgMode =~ m/^cpg/i) {
  printf STDERR "ERROR: function has moved to SeqMotif.pl\n";
  exit 1;
}
elsif ($ProgMode =~ m/^MotifLib=/i) {
  printf STDERR "ERROR: function has moved to SeqMotif.pl\n";
  exit 1;
}
elsif ($ProgMode =~ m/^MotifSeq=/i) {
  printf STDERR "ERROR: function has moved to SeqMotif.pl\n";
  exit 1;
}
elsif ($ProgMode =~ m/^(Tupel|Tuple)/i) {
  printf STDERR "ERROR: function has moved to SeqMotif.pl\n";
  exit 1;
}

# ensure input argument(s)
unless (@ProgArg) {
  print  STDERR "ERROR: input arguments missing\n";
  exit 1;
}
# ensure input argument(s)
my $arg;
foreach $arg (@ProgArg) {
  unless ( $arg eq '-' or -s &PathExpand($arg) ) {
    printf STDERR "WARNING: input file %s does not exist or has zero length\n", $arg||"''";
  }
}

# initialise sequence input queue
$ProgParam{store}{queue}{SeqIn} = SeqLab::SeqStreamIn->new(@ProgArg);
unless ($ProgParam{store}{queue}{SeqIn}) {
  printf STDERR "ERROR: unable to initialise sequence input queue\n";
  exit 1;
}
$ProgParam{store}{queue}{SeqIn}->AddSwitch (
  -ClipQuality => $ProgOpt{-ClipQuality},
  -ClipUnk     => $ProgOpt{-ClipUnk},
  -fast        => ($ProgMode =~ m/^(stat|BlastDB\b|orf|tupel|tuple)/i
                  and !$ProgOpt{-ClipQuality} and !$ProgOpt{-ClipUnk}) ?
                    1 : undef,
  -FilterDescr => $ProgOpt{-FilterDescr},
  -FilterID    => $ProgOpt{-FilterID},
  -lower       => $ProgOpt{-lower},
  -MatchID     => $ProgOpt{-MatchID},
  -pure        => $ProgOpt{-pure} ||
                  (($ProgMode =~ m/^(bisulfite$|BlastDB\b|(Cat|List)Cds$|CutMotif|Lowcplx$|MaskLowcplx$|mass|ORF$|ProtPlot$|Rand|retr(ansl)?|Site|stat|tr(ansl)?)/i) ? 1 : undef),
                    # don't enter 'break' in the preceeding line (both 'break' and 'BreakIntoAssembly' flow into $ProgMode='break')
  -SlcDescr    => $ProgOpt{-SlcDescr},
  -SlcEnds     => ($ProgMode =~ m/site/i) ? undef : $ProgOpt{-SlcEnds},
  -SlcID       => $ProgOpt{-SlcID},
  -SlcLen      => $ProgOpt{-SlcLen} ||
                  (($ProgMode =~ m/^BlastDB=/i) ? 1 : undef),
  -SlcType     => $ProgOpt{-SlcType},
  -upper       => $ProgOpt{-upper} ||
                  (($ProgMode =~ m/^(bisulfite$|mass$)/i) ? 1 : undef),
  -debug       => $ProgOpt{-debug} ? $ProgOpt{-debug}-1 : undef,
  );

# chain to program mode (with input argument(s))
if (0) { }
elsif ($ProgMode =~ m/^AnnotExpand$/i) {
  undef @ProgArg;
  while (defined ($arg=&SeqQueue())) {
    &ProgAnnotExpand ($arg);
  }
}
elsif ($ProgMode =~ m/^AnnotLowcplx(=([0-9.]+)(,([0-9]+))?)?$/i) {
  $ProgOpt{-LowcplxID} = $2;
  $ProgOpt{-LowcplxLen} = $4;
  undef @ProgArg;
  while (defined ($arg=&SeqQueue())) {
    &ProgLowcplxAnnot ($arg);
  }
}
elsif ($ProgMode =~ m/^Annot(Lwr|Upr)=([\w,]+)$/i) {
  $ProgOpt{AnnotList} = [ split (/,/, $2) ];
  $ProgOpt{AnnotCase} = $1;
  $ProgOpt{AnnotCase} =~ s/lwr/lower/i;
  $ProgOpt{AnnotCase} =~ s/upr/upper/i;
  undef @ProgArg;
  while (defined ($arg=&SeqQueue())) {
    &ProgAnnotCase ($ProgOpt{AnnotCase}, $arg);
  }
}
elsif ($ProgMode =~ m/^AnnotRen$/i) {
  # optionally, multiple statements have been pre-processed to yield
  # %{$ProgOpt{AnnotRen}} to be used as a translation table

  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgAnnotRen ($arg);
  }
}
elsif ($ProgMode =~ m/^AnnotRrna$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgAnnotRrnaConst();
  while (defined ($arg=&SeqQueue())) {
    &ProgAnnotRrna ($arg);
  }
}
elsif ($ProgMode =~ m/^bisulfite$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgBisulfite ($arg);
  }
}
elsif ($ProgMode =~ m/^BlastDB=(\S+)$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgMode = 'BlastDB';
  $ProgOpt{-db} = $1;
  if ($ProgOpt{-db} =~ m|/|) { $ProgOpt{-db} = &PathExpand($ProgOpt{-db}); }
  unless ($ProgOpt{-SlcLen}) {
    $ProgParam{store}{queue}{SeqIn}->AddSwitch (-SlcLen=>1);
  }
  &ProgBlastDb();
}
elsif ($ProgMode =~ m/^break$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgBreak ($arg);
  }
  if ($ProgOpt{-OutSeqFmt} eq 'Experiment') {
    print  "writing fofn (use it to import directed assembly)\n";
    if (@{$ProgParam{store}{SeqID}||[]}) {
      &WriteFile (($ProgOpt{-OutDir}||&PathCwd()) .'/fofn', join ("\n",
        @{$ProgParam{store}{SeqID}}, ''));
    } else {
      printf STDERR "WARNING: empty output\n";
    }
  }
}
elsif ($ProgMode =~ m/^(?:cat|(ListID)s?)$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  if ($1) { $ProgParam{store}{queue}{SeqOut}->AddSwitch(-format=>'ID') }
  while (defined ($arg=&SeqQueue())) {
    $ProgParam{store}{queue}{SeqOut}->Push($arg);
  }
}
elsif ($ProgMode =~ m/^CatAnnotSeq=([\w,]+)$/i) {
  $ProgOpt{AnnotList} = [ split(/,/,$1) ];
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    if ($ProgOpt{AnnotList}[0] eq 'IntgenBact') {
      &ProgIntgenBactCatseq ($arg);
    } elsif ($ProgOpt{AnnotList}[0] eq 'IntgenEukar' or $ProgOpt{AnnotList}[0] eq 'IntgenDicty') {
      &ProgIntgenDictyCatseq ($arg);
    } elsif ($ProgOpt{AnnotList}[0] eq 'Intron1st') {
      &ProgIntron1stCatseq ($arg);
    } else {
      &ProgAnnotCatseq ($arg);
    }
  }
}
elsif ($ProgMode =~ m/^CatCds$/i) {
  &ProgCatCds();
}
elsif ($ProgMode =~ m/^concat(?:=(\d+))?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgOpt{-concat} = $1;
  &ProgConcat();
}
elsif ($ProgMode =~ m/^CutMotif$/i) {
  undef @ProgArg;
  unless ($ProgOpt{-motif}) {
    print  STDERR "ERROR: specify a restriction enzyme with syntax -CutMotif=<enzyme>\n";
    exit 1;
  }
  while (defined ($arg=&SeqQueue())) {
    &ProgCutMotif ($arg);
  }
}
elsif ($ProgMode =~ m/^CutHTGS$/i) {
  undef @ProgArg;
  while (defined ($arg=&SeqQueue())) {
    &ProgCutHTGS ($arg);
  }
}
elsif ($ProgMode =~ m/^GffToExper$/i) {
  &ProgGffToExper (@ProgArg);
}
elsif ($ProgMode =~ m/^IdChg=(.+)/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgMode =~ m/^IdChg=((\S+) +(.*))$/i or do {
    printf STDERR "%s(IdChg). ERROR: unable to interpret switch argument syntax: \n",
      join ('',__PACKAGE__,', line ',__LINE__), $1;
    exit 1;
  };
  &ProgIdChg ($2, $3);
}
elsif ($ProgMode =~ m/^ListLen$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    printf "%s\n", length ($arg->{sequence});
  }
}
elsif ($ProgMode =~ m/^ListNblock$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgListNblock ($arg);
  }
}
elsif ($ProgMode =~ m/^Lowcplx(=(.+))?$/i) {
  $ProgOpt{-LowcplxUnit} = $2;
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgLowcplx (@ProgArg);
}
elsif ($ProgMode =~ m/^Mask(Annot|Tag)(=(.+))?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgMode = 'MaskAnnot';
  unless (@{$ProgOpt{-MaskAnnot}} = split (/,/, $3)) {
    print  STDERR "ERROR: missing list of annotation labels for program mode $ProgMode\n";
    exit 1;
  }
  while (defined ($arg=&SeqQueue())) {
    &ProgAnnotMask ($arg);
  }
}
elsif ($ProgMode =~ m/^MaskLowcplx(=([0-9.]+)(,(\d+))?)?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgOpt{-LowcplxID} = $2;
  $ProgOpt{-LowcplxLen} = $4;
  $ProgMode = 'MaskLowcplx';
  while (defined ($arg=&SeqQueue())) {
    &ProgLowcplxMask ($arg);
  }
}
elsif ($ProgMode =~ m/^mass$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    my $mass = &SeqMass ($arg, -strands=>$ProgOpt{-strands});
    if (defined ($mass)) { printf "%s\t%s\n",$arg->{id},$mass }
  }
}
elsif ($ProgMode =~ m/^ORF$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgOrf ($arg);
  }
}
elsif ($ProgMode =~ m/^ProtPlot$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgProtPlot ($arg);
  }
}
elsif ($ProgMode =~ m/^RandFrag(?:ment)?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgRandFragment();
}
elsif ($ProgMode =~ m/^RandOrder$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgRandOrder();
}
elsif ($ProgMode =~ m/^RandSegment$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgRandSegment();
}
elsif ($ProgMode =~ m/^range(?:=(.+))?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  unless ($ProgOpt{-RangeArg} = $1) {
    print  STDERR "ERROR: missing range argument in program mode -range=S\n";
    exit 1;
  }
  $ProgMode = 'range';
  while (defined ($arg=&SeqQueue())) {
    if ($arg = &SeqCplxRange ($arg, $ProgOpt{-RangeArg}, -debug=>$ProgOpt{-debug})) {
      $ProgParam{store}{queue}{SeqOut}->Push($arg);
    }
  }
}
elsif ($ProgMode =~ m/^retr(?:ansl(?:ate)?)?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgMode = 'retranslate';
  while (defined ($arg=&SeqQueue())) {
    &ProgRetransl ($arg);
  }
}
elsif ($ProgMode =~ m/^RevCompl$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgRevcompl ($arg);
  }
}
elsif ($ProgMode =~ m/^SiteHappy$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgSiteHappy ($arg);
  }
}
elsif ($ProgMode =~ m/^stat(ist)?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgStatist();
}
elsif ($ProgMode =~ m/^StatEntry$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgStatSmb (-individ=>1);
}
elsif ($ProgMode =~ m/^StatLen$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgStatLen();
}
elsif ($ProgMode =~ m/^StatSmb$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  &ProgStatSmb();
}
elsif ($ProgMode =~ m/^temp$/i) {
  &ProgTemp();
}
elsif ($ProgMode =~ m/^tr(ansl(ate)?)?$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  $ProgMode = 'translate';
  while (defined ($arg=&SeqQueue())) {
    &ProgTransl ($arg);
  }
}
elsif ($ProgMode =~ m/^UDG$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgUDG ($arg);
  }
}
elsif ($ProgMode =~ m/^Unique(ID)$/i) {
  undef @ProgArg;  # already stored in SeqStreamIn
  while (defined ($arg=&SeqQueue())) {
    &ProgUniqueID ($arg);
  }
}
else {
  print  STDERR "ERROR: unknown program mode or switch '$ProgMode'\n";
  exit 1;
}

# log overall time and space requirements at runtime
if ($ProgOpt{-timer}) {
  print  STDERR "overall time and space requirements at runtime\n";
  printf STDERR "  CPU time: %.2f s\n  CPU time incl. system: %.2f s\n  total CPU time of ps family: %.2f s\n",
    (times)[0], &Sum((times)[0,2]), &Sum(times);
  printf STDERR "  memory: %d kByte\n", &MyMem;
}

# exit script successfully
# cf. END blocks!
exit 0;


################################################################################
# usage help, command line arguments
################################################################################


sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile is designed for sequence and sequence file/database handling.
 The main purpose is to provide an command line interface to the functions
 available in SeqLab::SeqFormat.pm, SeqLab::SeqBench.pm, SeqLab::SeqAnalysis.pm,
 SeqLab::MotifRE.pm.
 This program does not intend to cover alignment functionalities. See Align.pl
 for this.

COMMAND LINE SYNTAX
 $ProgFile <ModeSwitch> [<options>] <Arg1> [<Arg2> ...]

Arguments
---------
 Typically, args specify sequence input files. Deviations are stated in the
 listing of ModeSwitches below.

Path Arguments
--------------
 Relative paths will be resolved according to the pwd. Prefixes "~" and "~uid"
 are resolved to the home directories. Path "-" resolves to STDIN or STDOUT,
 depending on the context.
 An input file may be identical to the output file (option -OutSeq=outfile),
 without conflicts, if the file is the first one in the list of input files.
 Another way to avoid input/output clashes is option -OutSeq=reqwrite,
 preserving the associations between files and sequence entries.

File Format Support
-------------------
 Input file format is determined automatically and may be one of:
   Experiment, fastA, GAP4 database, GenBank, GFF, plain, selex, struct,
   table
 The sequence formats are further described in the manual section
 "File Format Support"

 Output is written to STDOUT in standard fastA format or one of:

ModeSwitch (case-insensitive)
-----------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-AnnotExpand      expand annotation data structure
                  -OutSeqFmt=struct
                              this output format gives best supports the
                              enriched annotation data.
-AnnotLowcplx(=F(,N))
                  enter annotations for low-complexity ranges.
                  F           minimum mean of repetitive correlation measure
                              per position that must be satisfied in the
                              scanning window (see switch argument N). F has a
                              possible value range of [0,1]. Default is: 0.80
                  N           minimum length of repeat range and size of
                              scanning window in which scoring parameters have
                              to be fulfilled. N has a possible value range of
                              [1,256]. Default: 25
                  -AnnotLbl   annotation label, default: $ProgParam{default}{AnnotLbl}{lowcplx}
-AnnotLwr=S       change annotated sequence to lower case letters. The
                  respective annotations will be kept.
                  S           comma-separated list of annotation labels
                  -AnnotLyr   selector for annotation layer
                  -AnnotOrient
                              selector for annotation orientation
-AnnotRen=S,S     change annotation label from S1 to S2. Multiple switch
                  statements are allowed. Null-length S2 tells the program
                  to delete annotation S1.
                  -AnnotLabel no effect
                  -AnnotOrient
                              *** does not work ***
-AnnotRrna        analyse GenBank sequences for annotated rRNA. Output of data
                  structure format to STDOUT. By default, extensive messaging
                  is done to STDERR.
-AnnotUpr=S       change annotated sequence to upper case letters. The
                  respective annotations will be kept.
                  S           comma-separated list of annotation labels
                  -AnnotLyr   selector for annotation layer
                  -AnnotOrient
                              selector for annotation orientation
-bisulfite        convert sequence according to application of bisulfite method
                  to methylated DNA (100 % CpG methylation assumed) and a
                  successive PCR: every C will turn to a T if not followed by
                  a G.
                  --freqB=F(,F)
                              rel. bisulfitation frequency for non-methylated
                              C nucleotides. Specify global value or array of
                              indivdual values (corresponding to every single
                              C). The program will do a randomisation based on
                              the frequency values.
                  --freqM=F(,F)
                              rel. methylation frequency, analogous to
                              --freqB=...
                  --rept=N    do repeated parallel bisulfite reaction to
                              generate a pool of converted sequences. This is
                              mainly meant to generate a stastically meaningful
                              sample in order to simulate PCR product cloning
                              after the bisulfite reaction.
                  --U=B       use dUTP rather than dTTP in the PCR
-BlastDB=S        press BLAST database.
                  S           database path/name
                              default path: ENV{BLASTDBW} or ENV{BLASTDB}
                  -DbTitle    database title to be displayed in BLAST reports
                  -DbType     database type
                  -OutSeq     where to put the fastA format sequence output
                              generated by pressdb/setdb. Unless specified,
                              this output will be deleted.
                  -OutSeqFmt  this does not work here
                  -pure       purification mode "DNA5" is recommended for nt
                              sequences
-break(=N1(,N2))  break sequence into fragments
                  N1          fragment size (default: $ProgParam{default}{BreakLen})
                  N2          overlap size which is itself part of the fragment
                              size, default: 0.
                  --OlapN=B   in lagging sequence fragments change overlap
                              region to Ns
-BreakIntoAssembly(=N1(,N2))
                  convert sequences to a GAP4 directed assembly
                  N1          fragment size (default: $ProgParam{default}{BreakLen})
                  N2          default: $ProgParam{default}{BreakAssemblyOlap}
-cat              just (re-)output sequences. This can be used for file format
                  conversions etc., cmp. option -OutSeqFmt.
-CatAnnotSeq=S    extract sequence ranges underlaying an annotation. Complex/
                  distributed features (range syntax like "join(1..10,20..29)",
                  like in GenBank format) will be output as a single sequence.
                  S           comma-separated string of annotation labels.
                              Use "magic" annotation labels to request more
                              complex sequence features:
                              "IntgenBact"   bacterial intergenic sequence
                                 oriented in sense to the neighbouring gene
                              "IntgenEukar"  Dictyostelium (and probably
                                 other eukaryotic) intergenic sequence
                              "Intron1st"  1st intron of each transcript
                              These specials work fine with GenBank format
                              input; do not expect these features with other
                              formats.
                  -flank=N    add flanks of specified size to output sequence
                  -SlcValid=0 Does not work here. By default, pseudogenes are
                              output.
                  --gap=N     join sequence of two annotated ranges if they are
                              separated by less than the specified gap length,
                              default: never join sequences of different ranges
                  --nonred=B  for magic annotation features "Intgen*":
                              intergenic ranges of category 5',5' are normally
                              output doubly. Set this switch true in order to
                              remove this redundancy.
                  --uplow=B   turn annotated sequence upper case, flanking
                              sequence lower case.
-CatCds           output CDS sequences. If possible, try to use function
                  -CatAnnotSeq=CDS instead. Here, output is restricted
                  to fastA format. In input formats CDS may be annotated with
                  labels: [eE]xon, First, Internal, Terminal, Single
                  -SlcValid=0 include pseudogenes in output
-concat(=N)       concatenate sequence input into single sequence entry
                  N           optional: length of poly(N) spacer between joint
                              fragments, default: 0
-CutHTGS          cut HTGS 1 sequence(s) into contributing fragments
                  Assuming that fusion is done by >=100 Ns.
                  --numN=N    minimum number of Ns that provoke split. For
                              technical reasons arg N cannot exceed 32766.
-CutMotif=S       cut sequence(s) with restriction enzyme S.
                  S           motif defined in restriction motif library.
                              Motif library needs to be installed under
                              \$ENV{MOTIFPATH}/$CorePath{motif}{restric}
-GffToExper       convert GFF-formatted annotation to Experiment file format
                  tags. Generally, you can try:
                    $ProgFile -cat <GFF_input> -OutSeqFmt=Experiment.
                  But, there's the problem that GAP4 doesn't support annotation
                  labels that are more than 4 characters long. This script
                  function regards this labelling rule and it also meets
                  some GSCJ-specific annotation conventions.
                  Arg1+       input *.gff file(s)
-h(elp)           output command line syntax description and exit
-IdChg='S1 S2'    change sequence identifiers
                  S1          search (regexp)
                  S2          replace
                  you will have to put quotes around the option in order to
                  specify S2. A space as a delimiter makes sense here, since it
                  is a forbidden character in an identifier.
-ListID           list sequence identifiers
-ListNblock       list blocks of Ns, e.g. HTGS fragment separators. Output is
                  a list of values: seqID / offset / end / length, where
                  offset/end refer to the (possibly) gapped sequence, and length
                  refers to the gapped sequence.
                  --numN=N    minimum size of N block to be reported. For
                              technical reasons arg N cannot exceed 32766.
-ListLen          list sequence lengths
-Lowcplx(=S)      plot local low complexity score(s)
                  S           unit size range, default: $ProgParam{default}{LowcplxUnit}
                  -OutImg(=S) prepare image (S: path, default derived from
                              input path)
                  -OutImgRelhigh=F
                              image height in relation to width, default: $ProgParam{default}{OutImgRelhigh}{lowcplx}
                  -WinSize    window size for smoothening, default: $ProgParam{default}{WinSize}{lowcplx}
-MaskAnnot=S      replace annotated sequence ranges by Unknowns. The respective
                  annotations will be deleted.
                  S           comma-delimited list of the annotation labels
                              that are meant to be regarded in masking.
-MaskLowcplx(=F(,N))
                  replace low-complexity ranges by Unknowns.
                  Prior to low-complexity analysis, the sequence string will be
                  purified (cmp. ModeSwitch -pure) and therefore the sequence
                  string in output will not contain other than sequence-encoding
                  letters.
                  F           minimum mean of repetitive correlation measure
                              per position that must be satisfied in the
                              scanning window (see switch argument N). F has a
                              possible value range of [0,1]. Default is: 0.80
                  N           minimum length of repeat range and size of
                              scanning window in which scoring parameters have
                              to be fulfilled. N has a possible value range of
                              [1,256]. Default: 25
                  CAUTION: This program function performs very slowly on long,
                  gapped input sequences!
-mass             list Dalton masses of sequences
                  -strands=N  strandedness of nucleotide sequence
-ORF              perform ORF analysis on purified input sequence(s). Output
                  is a table reporting possible translation starts and
                  resulting protein sequences.
                  -OutImg(=S) prepare image (S: path, default derived from
                              input)
                  -OutImgRelhigh=F
                              image height in relation to width, default: $ProgParam{default}{OutImgRelhigh}{orf}
-ProtPlot         plot local protein sequence properties like: acidity,
                  hydrophobicity.
                  -OutImgRelhigh=F
                              image height in relation to width, default: $ProgParam{default}{OutImgRelhigh}{ProtPlot}
-RandOrder        randomise sequence entries
-RandFrag         concatenate input, randomly pick fragments (default size
                  1 kbp). Sequence IDs will be "RandFragment\\d".
                  --size=N    absolute size of randomised fragments, default
                              1 kbp
                  --sd=F      standard deviation for fragment size randomization
                  --spacesd=F standard deviation for randomized fragment spacing
-RandSegment      concatenate input, randomly pick fragments (default size
                  1 kbp), output segments of concatenated fragments (default:
                  two segments of same size). Sequence IDs will be
                  "RandSegment\\d".
                  --join=S    join fragments by sequence string S, default none.
                              This option allows to track the fusion sites.
                  --size=N    absolute size of randomised fragments, default
                              1 kbp
                  --sd=F      standard deviation for randomization of fragment
                              size. Highly recommended.
                  --seg=F1(,F2,...)
                              relative sizes (F1,F2,...) and effective number
                              of concatenated segments, default: two segments
                              of same size.
-range=S          extract sequence range according to range argument S. The
                  syntax of the range argument is based on the GenBank
                  annotation range syntax, e.g.:
                  - 'complement(join(1..20,41..60))'
                  - '1..-61'
                    A negative position value in the range argument refers to
                    the sequence end.
-retr(anslate)    perform protein to nucleotide re-translation
-RevCompl         do reverse-complement translation for sequence(s).
-SiteHappy        select HAPPY marker sites
                  -SlcEnds    select multiple marker sites if sequence size
                              exceeds this value, default: $ProgParam{default}{SiteHappyEnds}
-stat(ist)        statistics on sequence input - laconic version
-StatEntry        like -StatSmb but makes statistics on each individual
                  sequence entry
-StatLen          sequence length statistics on sequence input
                  -OutImg(=S) prepare graphical representation of sequence
                              length distribution. Argument S has the same
                              effect as in -OutStump=S
                  -OutImgRelhigh=F
                              image height in relation to width, default: $ProgParam{default}{OutImgRelhigh}{StatLen}
                  -OutStump=S output path stump in conjunction with -OutImg*
-StatSmb          symbol statistics on sequence input
-tr(anslate)      perform nucleotide to protein translation
-UDG              convert sequence according to uracil DNA glycosylase (UDG)
                  reaction and a successive endonuclease treatment
-UniqueID         produce sequence output that is non-redundant for sequence
                  identifiers. The first encountered entry of an ID doublette
                  will be kept for output.

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-AnnotLbl=S       specify an annotation label
-AnnotLyr=S       specify an annotation layer
-AnnotOrient=S    specify an annotation orientation: +, -, = (undefined),
                  . (all)
-ClipQual         clip off minor quality range from sequence, e.g. in
                  Experiment file format
-ClipUnk          clip unknown symbols at the end of the sequence string
-DbTitle=S        specify a database title, e.g. in program mode -BlastDB
-DbType=S         database type, program mode -BlastDB
                  NCBI 1.4    BLAST NCBI 1.4, WU-BLAST 2.0 (default)
                  NCBI 2.0    BLAST NCBI 2.0
-debug(=N)        print debug protocol to STDERR (sometimes STDOUT) and keep
                  temporary files
-FilterDescr=S    apply regexp to sequence description and ignore matching
                  sequence input
-FilterID=S       apply regexp to sequence IDs and ignore matching sequence input
-flank=N          size of flanking sequence
-fofn=S           supply a list of command arguments in a file
-log(=S)          redirect STDOUT and STDERR to LOG file
                  S           log file path, default path worked out
                              automatically from built-in directives.
-lower            force input sequence to lower case letters
-MatchID=S        specify a file containing a list of identifiers that shall be
                  used for sequence entry selection.
-MtfSlcID=S       select motifs by motif identifier (regexp)
-MtfSlcKey=S      select motifs by keyword entry (regexp)
-OutDir=S         directory for multi-file output. This switch overrides any
                  directory statement provided with switch -OutStump.
-OutIdFmt=S       modify sequence identifiers prior to output, case-insens-
                  itive option argument:
                  acc         try to extract bare Acc.Nos. from complex identi-
                              fier fields
                  acc2        try to extract Acc.Nos. from complex identifier
                              fields, including version suffix (but not database
                              prefix)
                  acc3        try to extract Acc.Nos. from complex identifier
                              fields, including database prefix and version
                              suffix
                  gi          try to extract gi numbers from complex identifier
                              fields
                  gidel       delete gi number component from complex identifier
                              fields
                  word        try to extract bare Acc.No. or first word from
                              complex identifier field. The danger is that
                              sequence identifiers turn non-unique.
                  Program mode -IdChg=S offers the most flexible function to
                  modify sequence identifiers.
-OutImg(=S)       force program to produce image output and (optionally)
                  specify output path
-OutImgRelhigh=F  define the output image's height in relation to its width
-OutImgTransp     turn image's background transparent
-OutImgWidth=N    define the output image's pixel width, default $ProgParam{default}{OutImgWidth}
-OutSeq=S         file path for sequence output, default: STDOUT. For multi-
                  file output use switch -OutDir and -OutStump.
-OutSeq=rewrite   preserve the file structure as in sequence input. Possible
                  combination with switches:
                  -OutSeqFmt  rewrite in specified file format
                  -OutDir     rewrite files into specified directory
-OutSeq=SingleSeq write single-sequence output with filenames identical to
                  sequence ID
-OutSeqFmt=S      format for sequence output, case-insensitive:
                  Experiment, fastA (default), FeatureTable, GenBank, GFF,
                  plain, PrettyHTML, selex, struct, table.
-OutSeqSort(=S)   sort sequence output ascending for criterion S which may be
                  "id" (default) or "descr"
-OutStump=S       path stump for multi-file output. A default is derived from
                  input file names in most cases.
-OutTab=S         file path for tabular output
-OutTabFmt=S      format for tabular output, case-sensitive:
                  gff         GFF format
                  html        HTML format
                  tab         TAB-delimited (default)
-pid=S            output pid to file
-pure(=S)         purify input sequence strings to leave letters which conform
                  with the sequence alphabet
-SlcDescr=S       select using RegExps match to sequence description
-SlcEnds=N        select sequence ends having the specified length N bp/aa. A
                  sequence smaller than two times this value will be taken
                  completely. A switch argument value lower than 1 will be
                  ignored.
-SlcFrame=S       restrict analysis to specified frames. Supply a comma-
                  delimited list including the special syntax:
                  + / plus    all plus frames
                  - / minus   all minus frames
-SlcID=S          apply RegExp to sequence IDs and ignore non-matching sequence
                  input
-SlcKnown=N       select sequences from input which have an at least n bp
                  spanning continuous non-masked sequence range. Masked
                  ranges are defined by poly(N) / poly(X) having a length
                  of >= $ProgParam{MaskLen} bp / aa.
-SlcLen=N1(..N2)  select input sequences according to their length
                  N1          minimum length
                  N2          maximum length, default: no limit
-SlcScore=F|N     select results by minimal score.
-SlcType=S        select for sequence type S. Type may be "DNA" / "protein".
-SlcValid=N       select for valid and consistent results, default: 1
-strands=N        specify strandedness of sequence. This affects annotation
                  selection, for example.
                  0           default: treat sequence as double-stranded
                  -1          treat sequence as single-stranded (anti-sense)
                  1           treat sequence as single-stranded (sense)
                  The available strandedness models are described in more
                  detail in the document
                  $CorePath{call}{MeInstDoc}/SeqStrands.txt
-timer            print time-performance protocol to STDERR
-upper            force input sequence to upper case letters
-v(erbose)        print extended protocol to STDOUT. You will need to set
                  option -OutSeq=S if you require valid sequence output.
-WinSize=N        specify window size for smoothening etc.
--*               program mode-specific switches. See the descriptions there.
                  Case-sensitive!

Environment Variables
---------------------
 \$PERLPATH        primary search path for Perl package look-up
 \$TEMPPATH        directory for storage of temporary files, default /tmp

Temporary Files
---------------
 The program may generate temporary files. These will be placed either in a
 directory specified by \$ENV{TEMPPATH} or in /tmp.

Further Reading
---------------
 A detailed, though incomplete, manual is available at
 $CorePath{call}{MeInstDoc}/$CorePath{man}{SeqHandle} .
END_USAGE
  print "\n";
  exit 1;
}


# add program switches to global table (hash)
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global options:
#   -debug       [STD]
#
# - global data:
#   $ProgMode
#   @ProgArg     may be pre-filled here beside it's typically used to store
#                return value from &GetoptsNArgs via &AddSwitch (option
#                -fofn)
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &MainLib::Misc::GetoptsNArgs
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can reported in accordance to
#   the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my ($switch) = shift;
  my ($debug, $SwitchArg);
  my ($pTable, $pSlc, $pMtfLib);

  # function parameters
  $debug = $ProgOpt{-debug};

  # optional switches
  if ($switch =~ m/^(?:Annot(?:Label|Lbl)|tag)(?:=(.+))?$/i) {
    if ($1 and int(grep{ m/gap/i } @ARGV) and length($1)!=4) {
      printf STDERR "WARNING: unusual annotation label %s (switch -AnnotLbl)\n", $1;
    }
    $ProgOpt{-AnnotLbl} = $1;
    return;
  }
  if ($switch =~ m/^AnnotLyr=(.+)$/i) {
    $ProgOpt{-AnnotLyr} = $1;
    return;
  }
  if ($switch =~ m/^AnnotOrient=(.+)$/i) {
    $SwitchArg = $1;
    if ($SwitchArg !~ m/^[-+=.]$/) {
      printf STDERR "ERROR: unknown syntax for annotation orientation: %s\n", $SwitchArg;
    } else {
      $ProgOpt{-AnnotOrient} = $SwitchArg;
    }
    return;
  }
  if ($switch =~ m/^AnnotRen=(.+?)(,(.*))?$/i) {
    # multiple statements allowed
    $ProgOpt{AnnotRen}{$1} = $3||'';
    $ProgMode ||= 'AnnotRen';
    return;
  }
  if ($switch =~ m/^ClipQual(ity)?$/i) {
    $ProgOpt{-ClipQuality} = 1;
    return;
  }
  if ($switch =~ m/^ClipUnk$/i) {
    $ProgOpt{-ClipUnk} = 1;
    return;
  }
  if ($switch =~ m/^DbTitle=(.+)$/i) {
    $ProgOpt{-DbTitle} = $1;
    return;
  }
  if ($switch =~ m/^DbType=(.+)$/i) {
    $ProgOpt{-DbType} = uc $1;
    return;
  }
  if ($switch =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($switch =~ m/^FilterDescr=(.+)$/i) {
    $ProgOpt{-FilterDescr} = $1;
    return;
  }
  if ($switch =~ m/^FilterID=(.+)$/i) {
    $ProgOpt{-FilterID} = $1;
    return;
  }
  if ($switch =~ m/^flanks?(=(.*))?$/i) {
    $ProgOpt{-flank} = $2;
    unless (int ($2)) {
      printf STDERR "ERROR: value N in switch -flank has to be positive integer, not %s\n",
        $ProgOpt{-flank}||"''";
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^fofn=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    if ($pTable = &LoadFoid($SwitchArg)) {
      push @ProgArg, @$pTable;
      $debug and printf STDERR "%s. %d entries loaded from fofn %s\n", &MySub, int @$pTable, $SwitchArg||"''";
    } else {
      printf STDERR "ERROR: unable to read entries from file of filenames %s (-> %s)\n",
        $1, $SwitchArg;
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^log(=(.*))?$/i) {
    $ProgOpt{-log} = $2 ? &PathExpand($2) : 1;
    return;
  }
  if ($switch =~ m/^lower$/i) {
    $ProgOpt{-lower} = 1;
    return;
  }
  if ($switch =~ m/^MatchID=(.+)$/i) {
    $SwitchArg = (
      grep { ($_ eq '-') or (-r $_ and ! -d $_ and -s $_) }
      &PathExpand($1), $1 )[0];
    if ($SwitchArg and $pSlc = &LoadFoid($SwitchArg)) {
      $ProgOpt{-debug} and printf STDERR "read %d entr%s from file of ID selectors: %s\n",
        int(@$pSlc), (int(@$pSlc)==1)?'y':'ies', $SwitchArg||"''";
      $ProgOpt{-MatchID} = $pSlc;
    } else {
      die sprintf "ERROR: unable to read file %s (evaluated %s, option -MatchID)\n",
        $1||"''", $SwitchArg||"''";
    }
    return;
  }
  if ($switch =~ m/^M(tf)?(Slc|Select)ID=(.+)$/i) {
    $ProgOpt{-MtfSlcID} = $3;
    return;
  }
  if ($switch =~ m/^M(tf)?(Slc|Select)Key=(.+)$/i) {
    $ProgOpt{-MtfSlcKey} = $3;
    return;
  }
  if ($switch =~ m/^OutDir=(.+)$/i) {
    $ProgOpt{-OutDir} = &PathExpand($1);
    unless (-d $ProgOpt{-OutDir}) {
      if (-e $ProgOpt{-OutDir}) {
        printf STDERR "ERROR: output destination exists, but is not a directory: %s\n", $ProgOpt{-OutDir}||"''";
        exit 1;
      }

      # this dialogue will also appear if there's no output do be done to any
      # file/directory
      else {
        if (int(grep{ $_ eq '-' }@ARGV)) {
          printf STDERR "creating non-existent output directory (skip dialogue in STDIN input mode)\n";
          mkdir ($ProgOpt{-OutDir});
        } else {
          printf STDERR "output directory does not exist, create?";
          if (&QueryConfirm()) {
            mkdir ($ProgOpt{-OutDir});
          } else { exit 1 }
        }
      }
    }
    return;
  }
  if ($switch =~ m/^OutIdFmt=([\w-]+)$/i) {
    $ProgOpt{-OutIdFmt} = ( grep { lc($1) eq lc($_) }
      grep { ref($SeqidFunc{$_}) eq 'CODE' } keys(%SeqidFunc) )[0];
    unless ($ProgOpt{-OutIdFmt}) {
      printf STDERR "ERROR: specified identifier format action %s (opton -OutIdFmt) is not available\n", $1;
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^OutImg(=(.+))?$/i) {
    $ProgOpt{-OutImg} = $2 ? &PathExpand($2) : '';
    return;
  }
  if ($switch =~ m/^OutImgRelhigh=([0-9\.eE+-]+)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgRelhigh} = $1;
    return;
  }
  if ($switch =~ m/^OutImgTransp(ar)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgTransp} = 1;
    return;
  }
  if ($switch =~ m/^OutImgWidth=(\d+)?$/i) {
    $ProgOpt{-OutImg} ||= '';
    $ProgOpt{-OutImgWidth} = $1;
    return;
  }
  if ($switch =~ m/^Out(?:Seq|File)=(.+)$/i) {
    $SwitchArg = $1;
    # conserve "-" (as an alias for stdin) and pipe commands
    $ProgOpt{-OutSeq} =
      ($SwitchArg =~ m/^(-$|\s*\|)/
        or int(grep{$_ eq $SwitchArg} @{$SeqLab::SeqStreamOut::LibGlob{FileMagic}})) ?
      $SwitchArg : &PathExpand($SwitchArg);
    return;
  }
  if ($switch =~ m/^OutSeqFmt=([\w-]+)$/i) {
    $ProgOpt{-OutSeqFmt} = ( grep { lc($1) eq lc($_) }
      grep { ref($SeqFFmtOutFunc{$_}) eq 'CODE' } keys(%SeqFFmtOutFunc) )[0];
    unless ($ProgOpt{-OutSeqFmt}) {
      printf STDERR "ERROR: specified sequence output format %s (opton -OutSeqFmt) is not available\n", $1;
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^OutSeqSort(=(\w+))?$/i) {
    $ProgOpt{-OutSeqSort} = $2 || $ProgParam{default}{OutSeqSort};
    unless ($ProgOpt{-OutSeqSort} =~ m/^(descr|id|length)$/) {
      printf STDERR "ERROR: unknown sorting criterion: %s\n", $ProgOpt{-OutSeqSort};
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^OutSt[au]mp=(.+)$/i) {
    $ProgOpt{-OutStump} = &PathExpand($1);
    return;
  }
  if ($switch =~ m/^OutTab=(.+)$/i) {
    $ProgOpt{-OutTab} = ($1 eq '-') ? '&STDOUT' : &PathExpand($1);
    $ProgParam{handle}{tab} = FileHandle->new(">$ProgOpt{-OutTab}");
    unless ($ProgParam{handle}{tab}) {
      printf STDERR "ERROR: unable to open specified table output file %s\n", $ProgOpt{-OutTab}||"''";
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^OutTabFmt=Experiment$/i) {
    printf STDERR "ERROR: no annotation label specified along with -OutTabFmt=Experiment\n";
    exit 1;
  }
  if ($switch =~ m/^OutTabFmt=(\S+)$/i) {
    $ProgOpt{-OutTabFmt} = $1;
    if ($ProgOpt{-OutTabFmt} !~ m/^(Experiment,\w{4}|gff|html|tab)$/i) {
      printf STDERR "ERROR: specified table output format %s is not available\n", $ProgOpt{-OutTabFmt}||"''";
      exit 1;
    }
    if ($ProgOpt{-OutTabFmt} =~ m/^Experiment/) {
      $ProgOpt{-OutSeqFmt} ||= 'Experiment';
    }
    return;
  }
  if ($switch =~ m/^pid=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    &WriteFile ($SwitchArg, "$$\n");
    return;
  }
  if ($switch =~ m/^pure(=(.+))?$/i) {
    $ProgOpt{-pure} = $2 || 1;
    return;
  }
  if ($switch =~ m/^Select/i) {
    print  STDERR "ERROR: selector switches are now spelled \"-Slc*\"\n";
    exit 1;
  }
  if ($switch =~ m/^SlcDescr=(.+)$/i) {
    $ProgOpt{-SlcDescr} = $1;
    return;
  }
  if ($switch =~ m/^SlcEnds=(\d+)$/i) {
    if ($1 > 0) {
      $ProgOpt{-SlcEnds} = $1;
    } else {
      print  STDERR "WARNING: option -SlcEnds=N does not take effect with N <= 0\n";
    }
    return;
  }
  if ($switch =~ m/^SlcFrame=([-+1-3,plusmin]+)$/i) {
    $ProgOpt{-debug} and printf STDERR "%s. select for frame with argument '%s'\n", &MySub, $1;
    foreach (split (/,/, $1)) {
      if ($_ =~ m/^(\+|plus)$/i) {
        push @{$ProgOpt{-SlcFrame}}, ( '+1', '+2', '+3' );
      }
      if ($_ =~ m/^(-|minus)$/i) {
        push @{$ProgOpt{-SlcFrame}}, ( '-1', '-2', '-3' );
      }
      if ($_ =~ m/^([-+]?)([1-3])$/) {
        push @{$ProgOpt{-SlcFrame}}, ($1||'+') . $2;
      }
    }
    @{$ProgOpt{-SlcFrame}} = sort { $a cmp $b; } @{$ProgOpt{-SlcFrame}};
    return;
  }
  if ($switch =~ m/^SlcID=(.+)$/i) {
    $ProgOpt{-SlcID} = $1;
    return;
  }
  if ($switch =~ m/^SlcKnown=(\d+)$/i) {
    $ProgOpt{-SlcKnown} = $1;
    $ProgOpt{-SlcLen} ||= Math::Range->new($1,undef);
    return;
  }
  if ($switch =~ m/^SlcLen=(.+)$/i) {
    unless ($ProgOpt{-SlcLen} = Math::Range->new_parsed($1)) {
      printf STDERR "ERROR: invalid argument for switch -SlcLen: $1\n";
      exit 1;
    }
    return;
  }
  if ($switch =~ m/^SlcScore=([0-9\.eE+-]+)$/i) {
    $ProgOpt{-SlcScore} = $1;
    return;
  }
  if ($switch =~ m/^SlcType=(.+)$/i) {
    $ProgOpt{-SlcType} = $1;
    return;
  }
  if ($switch =~ m/^SlcValid=(.*)$/i) {
    if ($1 ne '0' and $1 ne '1') {
      printf STDERR "ERROR: invalid argument %s for switch -SlcValid, must be 0/1\n", $1;
      exit 1;
    }
    $ProgOpt{-SlcValid} = $1;
    return;
  }
  if ($switch =~ m/^strands=(-?[12])$/i) {
    $ProgOpt{-strands} = $1;
    return;
  }
  if ($switch =~ m/^timer$/i) {
    $ProgOpt{-timer} = 1;
    return;
  }
  if ($switch =~ m/^upper$/i) {
    $ProgOpt{-upper} = 1;
    return;
  }
  if ($switch =~ m/^v(erbose)?$/i) {
    $ProgOpt{-verbose} = 1;
    return;
  }
  if ($switch =~ m/^WinSize=([\d.]+)$/i) {
    $ProgOpt{-WinSize} = $1;
    return;
  }
  if ($switch =~ m/^(?:-|var=)(\w+)[,=](.+)$/i) {
    $ProgOpt{-var}{$1} = $2;
    return;
  }

  # program mode switches
  if (defined($ProgMode)) {
    printf STDERR "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($switch||"''");
    exit 1;
  }
  else {

    # ProgMode break: we derive Breaklen & BreakOlap
    if ($switch =~ m/^break(=(\d+)(,(\d+))?)?$/i) {
      $ProgOpt{BreakLen}  = $2 || $ProgParam{default}{BreakLen};
      $ProgOpt{BreakOlap} = int $4;
      $ProgMode = 'break';
      return;
    }

    # ProgMode break: we derive Breaklen & BreakOlap
    if ($switch =~ m/^BreakIntoAssembly(=(\d+)(,(\d+))?)?$/i) {
      $ProgOpt{BreakLen}  = $2 || $ProgParam{default}{BreakLen};
      $ProgOpt{BreakOlap} = length($4) ? $4 : $ProgParam{default}{BreakAssemblyOlap};
      $ProgOpt{-OutSeq} = 'SingleSeq';
      $ProgOpt{-OutSeqFmt} = 'Experiment';
      $ProgMode = 'break';
      return;
    }

    # ProgMode CutMotif: we need motif switch
    if ($switch =~ m/^CutMotif=(\w+)$/i) {
      my $pMtfLib = SeqLab::MotifLib->new();
      $pMtfLib->AddSwitch(-debug=>$debug);
      if ($pMtfLib->Load('restric')) {
        $debug and printf STDERR "%s. %d motif%s in restric library\n", &MySub,
          $pMtfLib->Size(), ($pMtfLib->Size()==1)?'':'s';
      } else {
        die sprintf "%s. unable to find restric library\n", &MySub;
      }
      my ($pMtf) = grep{ $_->ID() eq $1 } @{$pMtfLib->{motif}};
      unless ($ProgOpt{-motif}=$pMtf) {
        die sprintf "ERROR: unable to find motif entry for restriction enzyme %s\n", $1||"''";
      }
      $ProgMode = 'CutMotif';
      return;
    }

    $ProgMode = $switch;
    return;
  }
}


################################################################################
# basic I/O
################################################################################


# produce sequence array data structure from sequence input files
#
# INTERFACE
# - global options:
#   -debug      [STD]
#   -SlcKnown   [STD]
#
# - return val: - reference to sequence data structure
#               - undef if queue is empty or an error occurs
#
# DESCRIPTION
# - object SeqLab::SeqStreamIn initialisation has been done in the MAIN body
#   or in the ProgMode function.
#
sub SeqQueue {
  my ($debug);
  my ($pSeq);
  my ($RegexpSplit, @fragment, $FragLen);

  # function parameters
  $debug = $ProgOpt{-debug};

  # redo loop
  {

    # get next sequence from input queue
    unless ($pSeq = $ProgParam{store}{queue}{SeqIn}->GetNext()) {
      return undef;
    }
    $debug and printf STDERR "%s. preparing sequence %s\n", &MySub, $$pSeq{id}||"''";
    $$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence}, -upper=>1);

    # masking character for sequences
    # it's derived from sequence type (first entry)
    if (($ProgOpt{-SlcKnown} or $ProgMode =~ m/^mask/i)
        and ! $ProgParam{MaskChar}
        and $$pSeq{sequence}) {
      $ProgParam{MaskChar} = $SeqSmbUnk{&SeqType($$pSeq{SeqPure})};
      if ($ProgParam{MaskChar}) {
        $debug and printf STDERR "%s. sequence ID %s: type %s, masking character %s\n", &MySub,
          $$pSeq{id}||"''", scalar(&SeqType($$pSeq{SeqPure}))||"''", $ProgParam{MaskChar}||"''";
      } else {
        $ProgParam{MaskChar} ||= 'N';
        printf STDERR "%s. warning: unable to determine sequence type from sequence ID %s, assigning default masking character %s\n", &MySub,
          $$pSeq{id}||"''", $ProgParam{MaskChar}||"''";
      }
    }

    # select by length of known sequence
    if ($ProgOpt{-SlcKnown}) {
      $RegexpSplit = sprintf ('%s{%d,}', $ProgParam{MaskChar}, $ProgParam{MaskLen});
      @fragment = split (/$RegexpSplit/, $$pSeq{SeqPure});
      foreach (@fragment) {
        $FragLen = &Max ($FragLen, length $_);
      }
      if ($FragLen < $ProgOpt{-SlcKnown}) {
        $debug and printf STDERR "%s. sequence %s fails length filter %d (split %d x %s), fragment maximum = %d\n", &MySub,
          $$pSeq{id}||"''", $ProgOpt{-SlcKnown},
          $ProgParam{MaskLen}, $ProgParam{MaskChar},
          $FragLen;
        redo;
      }
    }
  }

  # return reference to sequence structure
  return $pSeq;
}


# work out output path base from sequence source file and/or identifier
# information
#
# INTERFACE
# - argument 1*: reference to sequence data structure
#
# - options:
#   -stamp       append to file-derived name stem if global switch -OutStump
#                is not given
#
# - global options:
#   -debug       [STD]
#   -OutDir      [STD]
#   -OutStump    [STD]
#
# - return val:  output path base
#
sub PrepOstump {
  my ($pSeq, %opt) = @_;
  my ($debug);
  my ($PathRef, $PathStamp);

  # function parameters
  $debug = $ProgOpt{-debug};

  # prepare output path base
  # 1st: $ProgOpt{-OutStump}
  # 2nd: $$pSeq{SrcPath} ('-' to 'stdin')
  # 3rd: $ProgArg[0]
  unless ($PathStamp = $ProgOpt{-OutStump}) {
    $PathRef = $$pSeq{SrcPath} || $ProgArg[0];
    if ($PathRef eq '-') { $PathRef = 'stdin'; }
    $PathStamp = &PathChgSuffix (&PathExpand($PathRef), '', -last=>1);
    $PathStamp .= $opt{-stamp};
  }

  # determine effective directory
  $ProgOpt{-OutDir} and $PathStamp =
    $ProgOpt{-OutDir} .'/'. &PathSplit($PathStamp)->{name};

  # final refinement: expand to nice, rooted path
  $PathStamp = &PathExpand ($PathStamp);

  # return path
  return $PathStamp;
}


# output line of introductory note preceding a table
#
# INTERFACE
# - argument 1: intro text line (incl. LF)
#
# - global options:
#   -OutTab     [STD]
#   -OutTabFmt  [STD]
#
# - global data:
#   $ProgParam{handle}{tab}  table output file handle
#
sub OutTabIntro {
  my ($LinePlain) = @_;
  my $hOutTab = $ProgParam{handle}{tab};
  my $bOutHtml = int (exists($ProgOpt{-OutTabFmt}) and $ProgOpt{-OutTabFmt} eq 'html');
  my $CodeCRLF = $bOutHtml? "<BR>\n" : "\n";
  my $CodeComm = $bOutHtml? '' : '# ';

  # convert LF syntax
  $LinePlain =~ s/\n/$CodeCRLF/;

  # output line
  print  $hOutTab "$CodeComm$LinePlain";
}


# output table header in chosen format
#
# INTERFACE
# - argument 1: reference to field array (header line)
#
# - global options:
#   -OutTab     [STD]
#   -OutTabFmt  [STD]
#
# - global data:
#   $ProgParam{handle}{tab}  table output file handle
#
sub OutTabHeadln {
  my ($pField) = @_;
  my $hOutTab = $ProgParam{handle}{tab};
  my $bOutHtml = int (exists($ProgOpt{-OutTabFmt}) and $ProgOpt{-OutTabFmt} eq 'html');

  # HTML format: start table
  if ($bOutHtml) {
    print  $hOutTab "<TABLE BORDER=0 CELLPADDING=2 CELLSPACING=2>\n";
  }

  # TAB format: hide column labels in comments
  else {
    print  $hOutTab "#\n# column labels:\n# ";
  }

  # output header line
  &OutTabLine ($pField);
}


# output table line in chosen format
#
# INTERFACE
# - argument 1: reference to field array
#
# - global options:
#   -OutTab     [STD]
#   -OutTabFmt  [STD]
#
# - global data:
#   $ProgParam{handle}{tab}  table output file handle
#
sub OutTabLine {
  my ($pField) = @_;
  my $hOutTab = $ProgParam{handle}{tab};
  my $bOutHtml = int (exists($ProgOpt{-OutTabFmt}) and $ProgOpt{-OutTabFmt} eq 'html');
  my $line;

  # HTML format
  if ($bOutHtml) {
    $line  = '<TR>';
    foreach (@$pField) {
      my $field = &HtmlCharEncode ($_);
      $field = $field || '&nbsp;';
      $line .= "<TD>$field</TD>";
    }
    $line .= '</TR>';
  }

  # TAB format
  else {
    $line = join ("\t", @$pField);
  }

  # output line
  print  $hOutTab "$line\n";
}


# output table tail in chosen format
#
# INTERFACE
# - global options:
#   -OutTab     [STD]
#   -OutTabFmt  [STD]
#
# - global data:
#   $ProgParam{handle}{tab}  table output file handle
#
sub OutTabTail {
  my $hOutTab = $ProgParam{handle}{tab};
  my $bOutHtml = int (exists($ProgOpt{-OutTabFmt}) and $ProgOpt{-OutTabFmt} eq 'html');

  # only for HTML something to do here
  if ($bOutHtml) {
    print  $hOutTab "</TABLE>\n";
  }
}


################################################################################
# sequence transformation
################################################################################


# temporary routine
#
# INTERFACE
# - global options:
#   --rept,N  ...
#
sub ProgTemp {
  my ($pSeq);
  #my ($iSeqLen, $SeqChar);

  # input sequence
  while ($pSeq = &SeqQueue()) {
    if ($$pSeq{head} =~ m/(ddbj|emb|gb|sp)\|([^|]+)/) { $$pSeq{id} = $2; }
    printf "%s\t%s\t%s\n", $$pSeq{id}, $$pSeq{organism}, $$pSeq{taxa};
  }
}

# temporary routine - join masking of two versions of the same sequence
#
sub ProgTempMaskJoin {
  my ($pSeq1, $pSeq2, $iSeqLen, $SeqChar);
  my ($PosCt);

  # input sequences
  $pSeq1 = &SeqQueue();
  $pSeq2 = &SeqQueue();
  if (! ($iSeqLen = length ($$pSeq1{sequence})) or
      length ($$pSeq1{sequence}) != length ($$pSeq2{sequence})) {
    printf STDERR "ERROR: sequence length is zero or inequal for seq 1/2\n";
    return;
  }

  # output sequence data structure
  for ($PosCt=0; $PosCt<$iSeqLen; $PosCt++) {
    if (($SeqChar = substr ($$pSeq2{sequence}, $PosCt, 1)) =~ m/n/i) {
      substr ($$pSeq1{sequence}, $PosCt, 1) = $SeqChar;
    }
  }

  # output sequence
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq1);
}


# change sequence identifier
#
# INTERFACE
# - argument 1: search pattern
# - argument 2: replace expression
#
# - global options:
#   -debug      [STD]
#
sub ProgIdChg {
  my ($search, $replace) = @_;
  my ($debug);
  my ($pSeq);

  # function parameters
  $debug = $ProgOpt{-debug};

  # replacement parameters
  foreach (\$search, \$replace) {
    $$_ =~ s#(^|[^\\])/#$1\\/#g;
  }
  unless ($search) {
    print  STDERR "ERROR: no search pattern specified\n";
    exit 1;
  }
  $debug and printf STDERR "%s. search for: '%s', replace by: '%s'\n", &MySub,
    $search, $replace;

  # loop over sequences
  while (defined ($pSeq=&SeqQueue())) {

    # additional sequence features
    if (index($replace,'$$pSeq') >= 0) {
      $$pSeq{length} ||= length ($$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence}));
    }

    # replace
    $debug and printf STDERR "%s. seq ID before: %s\n", &MySub, $$pSeq{id};
    eval "\$\$pSeq{id} =~ s/$search/$replace/gm";
    $debug and printf STDERR "%s. seq ID after: %s\n", &MySub, $$pSeq{id};
    delete $$pSeq{header};

    # output sequence
    $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
  }
}


# prepare reverse-complement of sequence(s)
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
sub ProgRevcompl {
  my ($pSeq) = @_;

  # change sequence
  $pSeq = &SeqRevcompl ($pSeq);

  # output sequence
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# translate nucleotide to protein sequence
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#   -SlcFrame   default: all
#
sub ProgTransl {
  my ($pSeq) = @_;
  my ($debug, $dbg2, @frame);
  my ($ItFrame, %SeqProt);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  @frame = (exists $ProgOpt{-SlcFrame} and @{$ProgOpt{-SlcFrame}}) ?
    @{$ProgOpt{-SlcFrame}} : (1, 2, 3, -1, -2, -3);
  # sequence string was purified via SeqStreamIn

  # do translation
  foreach $ItFrame (@frame) {
    %SeqProt = %$pSeq;
    delete $SeqProt{header};
    $SeqProt{descr} .= ($SeqProt{descr} ? ', ':'') . 'translation';
    if (@frame>1 or $frame[0]!~m/1$/) {
      $SeqProt{id} .= sprintf ('_frame%+d', $ItFrame);
      $SeqProt{descr} .= ($SeqProt{descr}?', ':'') . "translation of frame $ItFrame";
    }
    $SeqProt{sequence} = &TranslNt ($$pSeq{sequence}, -frame=>$ItFrame, -debug=>$dbg2);

    # output protein sequence
    $ProgParam{store}{queue}{SeqOut}->Push(\%SeqProt);
  }
}


# translate protein to nucleotide sequence
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#
sub ProgRetransl {
  my ($pSeq) = @_;
  my ($debug, $dbg2);
  my (%EntryProt);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  # sequence string was purified via SeqStreamIn

  # do re-translation
  delete $$pSeq{header};
  $$pSeq{descr} .= ($$pSeq{descr} ? ', ':'') . 're-translation';
  $$pSeq{sequence} = &TranslProt ($$pSeq{sequence}, -debug=>$dbg2);

  # output protein sequence
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# convert GFF-formatted annotation to Experiment file format tags
#
# INTERFACE
# - argument 1+: *.gff input file(s)
#
# - global options:
#   -debug       [STD]
#
sub ProgGffToExper {
  my (@PathIn) = @_;
  my ($debug, $dbg2);
  my ($PathCurr, $pGffStruct, %GeneStruct, $pGene);
  my ($pAnnot, $pAnnotNext, @feature);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # loop over input sources, read GFF
  foreach $PathCurr (@PathIn) {
    $pGffStruct = &GffStruct ($PathCurr, -debug=>$dbg2);

    # loop over exon annotations
    for $pAnnot (@$pGffStruct) {
      $$pAnnot{feature} =~ m/^(cds(_exon)?|exon(_cds)?|first|internal|single|terminal)$/i or next;

      # enter new gene, add to existing gene
      $pGene = $GeneStruct{$$pAnnot{id}}{$$pAnnot{group}} ||= {};
      $$pGene{orient} ||= $$pAnnot{orient};
      if ($$pGene{orient} != $$pAnnot{orient}) {
        printf STDERR "WARNING: feature %s, range %d..%d %s, is anti against gene, enter?\n",
          $$pAnnot{feature}, $$pAnnot{offset}, $$pAnnot{end}, &SignChar($$pAnnot{orient});
        unless (&QueryConfirm()) { next }
        $$pAnnot{orient} = $$pGene{orient};
      }
      push @{$$pGene{feature}}, $pAnnot;
    }

    # loop over genes (each being a group of exons)
    for $pGene (@{ &DataTreeSlc (\%GeneStruct, [[0,'all'],[0,'all']], -debug=>$dbg2) }) {
      $$pGene{id} = $$pGene{feature}[0]{id};
      $$pGene{method} = $$pGene{feature}[0]{method};
      $$pGene{group} = $$pGene{feature}[0]{group};
      $$pGene{offset} = &Min (map { ($_->{offset}, $_->{end}); }
        @{$$pGene{feature}});
      $$pGene{end}    = &Max (map { ($_->{offset}, $_->{end}); }
        @{$$pGene{feature}});
      $$pGene{score}  = &Min (map { $_->{score} } @{$$pGene{feature}});
      @feature = sort { $$a{offset}*$$a{orient} <=> $$b{offset}*$$b{orient}
        } @{$$pGene{feature}};
      delete $$pGene{feature};
      $pAnnot = shift @feature;
      if ($$pAnnot{method} eq 'glimmerM') {
        push @{$$pGene{feature}}, {
          %$pAnnot,
          feature => 'start',
          offset  => ($$pAnnot{orient} > 0) ? $$pAnnot{offset}   : $$pAnnot{end}-2,
          end     => ($$pAnnot{orient} > 0) ? $$pAnnot{offset}+2 : $$pAnnot{end}  ,
          }, {
          %$pAnnot,
          feature => 'stop',
          offset  => ($$pAnnot{orient} > 0) ? $$pAnnot{end}-2 : $$pAnnot{offset}  ,
          end     => ($$pAnnot{orient} > 0) ? $$pAnnot{end}   : $$pAnnot{offset}+2,
          };
      }
      while ($pAnnotNext = shift @feature) {
        push @{$$pGene{feature}}, {
          %$pAnnot,
          feature => 'SpliceDonor',
          offset  => ($$pAnnot{orient} > 0) ? $$pAnnot{end}+1 : $$pAnnot{offset}-2,
          end     => ($$pAnnot{orient} > 0) ? $$pAnnot{end}+2 : $$pAnnot{offset}-1,
          };
        push @{$$pGene{feature}}, {
          %$pAnnot,
          feature => 'SpliceAccept',
          offset  => ($$pAnnot{orient} > 0) ? $$pAnnotNext{offset}-2 : $$pAnnotNext{end}+1,
          end     => ($$pAnnot{orient} > 0) ? $$pAnnotNext{offset}-1 : $$pAnnotNext{end}+2,
          };
        $pAnnot = $pAnnotNext;
      }

      # output Experiment format
      printf "ID   %s\nTC   ENZ2 %s %d..%d\nTC        model#=%s\nTC        evidence=%s\n%s",
        $$pGene{id}, &SignChar($$pGene{orient}, -allow0=>'='),
        $$pGene{offset}, $$pGene{end},
        $$pGene{group}, $$pGene{method},
        ($$pGene{score} =~ m/\d/) ? "TC        score=$$pGene{score}\n" : '';
      foreach $pAnnot (@{$$pGene{feature}}) {
      printf "TC   ENZ8 %s %d..%d\nTC        %s\nTC        evidence=%s\n",
        &SignChar($$pAnnot{orient},-allow0=>'='),
        $$pAnnot{offset}, $$pAnnot{end},
        $$pAnnot{feature}, $$pAnnot{method};
      }
    }

    # debug data structure
    $debug and &DataPrint(\%GeneStruct,-handle=>\*STDERR);

  } # loop over input files
}


# convert sequence according to bisulfite method
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#   --freqB     rel. bisulfitation frequency
#   --freqM     rel. methylation frequency
#   --rept      repeat sequence conversion
#   --U         use dUTP in PCR
#
# DEVELOPER'S NOTES
# - the sequence string was purified in SeqStreamIn object
#
sub ProgBisulfite {
  my ($pSeq) = @_;
  my ($bMe, $debug, $rept, $SmbNew,
      $bFreq, $freqB_glob, @freqB, $freqM_glob, @freqM);
  my (%match, $status, $bMethyl, $bTurnOv);
  my ($pSeqTreat);

  # function parameters
  $bMe = ((getlogin()||getpwuid($<)) eq 'szafrans') ? 1 : 0;
  $debug = $ProgOpt{-debug};
  if (
    ($ProgOpt{-var}{freqB} and $ProgOpt{-var}{freqB}!=1.0) or
    ($ProgOpt{-var}{freqM} and $ProgOpt{-var}{freqM}!=1.0)
  ) {
    $bFreq = 1;
    @freqB = $ProgOpt{-var}{freqB} ? split(/,/,$ProgOpt{-var}{freqB}) : (1.0);
    $freqB_glob = &SampleMean (\@freqB);
    @freqM = $ProgOpt{-var}{freqM} ? split(/,/,$ProgOpt{-var}{freqM}) : (1.0);
    $freqM_glob = &SampleMean (\@freqM);
  }
  $rept = ($bFreq and $ProgOpt{-var}{rept}) ? $ProgOpt{-var}{rept} : 1;
  $SmbNew = $ProgOpt{-var}{U} ? 'U' : 't';

  # do transformation, simple version
  if (! $bFreq) {
    while ($$pSeq{sequence} =~ m/[cC]([^gG])/) {
      $$pSeq{sequence} = $` . $SmbNew . $1 . $';
    }

    # output transformed sequence
    $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
  }

  # do transformation, complicated version
  else {
    for (my $CtRept=0; $CtRept<$rept; $CtRept++) {
      $pSeqTreat = &DataClone ($pSeq);
      my ($CtC, $CtCG) = (0, 0);
      while ($$pSeqTreat{sequence} =~ m/([cC])(.)/g) {
        %match = (
          pre  => $`,
          pos  => length($`) + 1,
          C    => $1,
          follw  => $2,
          post => $',
          );

        # randomise to get single treated/cloned molecule
        if (uc($2) ne 'G') {
          $status = 'C';
          $bMethyl = 0;
        } else {
          $status = 'CpG';
          $bMethyl = 1;
          if (($freqM[$CtCG]||=$freqM_glob) != 1.0) {
            $bMethyl = (rand()<$freqM[$CtCG]) ? 1 : 0;
          }
          $CtCG ++;
        }
        $bTurnOv = $bMethyl ? 0 : 1;
        if ($bTurnOv and ($freqB[$CtC]||=$freqB_glob) != 1.0) {
          $bTurnOv = (rand()<$freqB[$CtCG]) ? 1 : 0;
        }
        $CtC ++;
        # construct molecule
        $debug||$bMe and printf STDERR "%s. %s (%s) position %d, methyl %d, turn-over %d\n", &MySub,
          $status, ($status eq 'CpG')?sprintf('%d,%d',$CtC,$CtCG):$CtC,
          $match{pos}, $bMethyl, $bTurnOv;
        $$pSeqTreat{sequence} = $match{pre}
          . ($bTurnOv?$SmbNew:$match{C}) . $match{follw}
          . $match{post};
        pos($$pSeqTreat{sequence}) = $match{pos};
      }

      # output transformed sequence
      $ProgParam{store}{queue}{SeqOut}->Push($pSeqTreat);
    } # for ($CtRept...
  }
}


################################################################################
# sequence fragmentation and concatenation
################################################################################


# cut into equally-sized fragments
#
# INTERFACE
# - argument 1: reference to source sequence entry (sequence string is
#               purified)
#
# - global options:
#   BreakLen    length of target fragments, processed from program mode switch
#               -break and possibly defaults
#   BreakOlap   length of target fragment overlap, processed from program mode
#               switch -break and possibly defaults
#   -debug      [STD]
#   -OutSeqFmt=Experiment
#               construct directed assembly
#   --OlapN     Ns in overlap region
#
sub ProgBreak {
  my $QualExtern = 2;
  my ($pSeq) = @_;
  my ($debug, $dbg2, $bExper, $bSimple, $BreakLen, $BreakOlap);
  my ($iSeqLen, $ItPos, @PosCut);
  my ($pSeqFrag, $CtSeq, $SeqFragLen, $SeqLastID, $CtI);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  $BreakLen  = $ProgOpt{BreakLen};
  $BreakOlap = $ProgOpt{BreakOlap};
  $bExper = (exists($ProgOpt{-OutSeqFmt}) and $ProgOpt{-OutSeqFmt} eq 'Experiment') ? 1:0;
  $bSimple = (! exists($ProgOpt{-OutSeqFmt}) or $ProgOpt{-OutSeqFmt}=~m/^(fastA|plain|table)$/) ? 1:0;
  $debug and printf STDERR "%s. cutting into fragments of size %d, overlap %d\n", &MySub,
    $BreakLen, $BreakOlap;
  $bExper and &SeqExperID ($pSeq);

  # get array of cut positions
  $iSeqLen = length $$pSeq{sequence};
  $ItPos = 1;
  while (($ItPos += $BreakLen - $BreakOlap) + $BreakOlap < $iSeqLen) {
    push @PosCut, $ItPos;
  }

  # cut and loop over sequence fragments
  undef $CtSeq;
  foreach $pSeqFrag ( map { @$_ }
    &SeqCutArray ($pSeq, \@PosCut, -olap=>$BreakOlap, -simple=>$bSimple, -debug=>$dbg2)
  ) {
    $debug and printf STDERR "%s. generated fragment %s\n", &MySub, $$pSeqFrag{id};

    # turn overlap (left end of 2nd and following fragments) to Ns if the
    # destined sequence format is "Experiment"
    if ($CtSeq and ($bExper or $ProgOpt{-var}{OlapN})) {
      substr($$pSeqFrag{sequence},0,$BreakOlap) = 'N' x $BreakOlap;
    }

    $CtSeq ++;

    # change entry to be part of Experiment, directed assembly
    if ($bExper) {

      # enter AP line
      if ($CtSeq == 1) {
        $$pSeqFrag{AP} = '*new* +';
      } else {
        $$pSeqFrag{AP} = sprintf ("%s + %d -1", $SeqLastID, $BreakLen - $BreakOlap);
      }

      # enter template name
      $$pSeqFrag{TN} = $$pSeq{id};

      # enter full-sequence annotation (flag for extern sequence)
      # default: $ProgParam{default}{AnnotLbl}{extern}
      if (! exists ($ProgOpt{-AnnotLbl}) or $ProgOpt{-AnnotLbl}) {
        push @{$$pSeqFrag{annot}}, {
          type   => exists($ProgOpt{-AnnotLbl}) ? $ProgOpt{-AnnotLbl} : $ProgParam{default}{AnnotLbl}{extern},
          offset => 1,
          end    => length ($$pSeqFrag{sequence}),
          text   => $$pSeq{descr},
          layer  => $ProgOpt{-AnnotLayer},
          };
      }

      # enter AV data line to construct directed assembly
      delete $$pSeqFrag{AV};
      $SeqFragLen = length ($$pSeqFrag{sequence});
      for ($CtI=0; $CtI<$SeqFragLen; $CtI+=20) {
        push @{$$pSeqFrag{AV}}, ($CtI ? '     ' : '') . ("$QualExtern " x &Min($SeqFragLen-$CtI,20));
      }

      # remind sequence identifier
      push @{$ProgParam{store}{SeqID}}, $$pSeqFrag{id};
      $SeqLastID = $$pSeqFrag{id};
      print  "writing sequence fragment $SeqLastID\n";
    }

    # output sequence entry
    $ProgParam{store}{queue}{SeqOut}->Push($pSeqFrag);
  }
}


# cut at motif hit positions
#
# INTERFACE
# - argument 1: reference to source sequence entry (sequence string is
#               purified)
# - argument 2: reference to motif definition, cmp. SeqLab::MotifX.pm
#
# - global options:
#   -debug      [STD]
#
# DEBUG, CHANGES, ADDITIONS
# - currently, the cut is done leftwards of the hit position. The user,
#   instead may want the hit motif to be cut out (it has a size) or to be
#   part of both sequence fragments.
# - motif search currently works on purified sequence only.
#   We could try to remap the positions onto the original gapped sequences like
#   it's done in SeqMotif.pl's &ProgSearchMotif
#
sub ProgCutMotif {
  my ($pSeq) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $pMtf = $ProgOpt{-motif};
  $debug and printf STDERR "%s. cutting at motif %s, definition <%s>\n", &MySub,
    $pMtf->{id}||"''", $pMtf->{def}||"''";

  # do search, get array of positions
  my $pHit = $pMtf->Search ($$pSeq{sequence}, -isPure=>1, -strands=>0, -debug=>$dbg2);
  my @PosCut = map { $_->{offset} } @$pHit;
  $debug and printf STDERR "%s. got motif match positions: %s\n", &MySub,
    join (', ', @PosCut)||"''";

  # output sequence fragments
  foreach my $pSeqFrag (@{ &SeqCutArray ($pSeq, \@PosCut, -olap=>length($$pHit[0]{instance}), -debug=>$dbg2) }) {
    $ProgParam{store}{queue}{SeqOut}->Push($pSeqFrag);
  }
}


# list blocks of Ns (possibly HTGS fragment separators)
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# - global options:
#   -debug      [STD]
#
sub ProgListNblock {
  my ($pSeq) = @_;
  my ($debug, $NumN);

  # parameters
  $debug = $ProgOpt{-debug};
  $NumN = $ProgOpt{-var}{numN} || 100;

  # find N block of minimal length
  while ($$pSeq{sequence} =~ m/(N-*){$NumN}/iog) {
    my $lenpure = $NumN;
    my $end = pos($$pSeq{sequence});
    my $off = $end - length($&) + 1;

    # determine end ofN block => complete length
    if ($$pSeq{sequence} =~ m/[^nN-]/ig) {
      my $follw = substr ($$pSeq{sequence}, $end, pos($$pSeq{sequence})-$end-1);
      $end = pos($$pSeq{sequence}) - 1;
      $lenpure += &MatchCt (\$follw, '[nN]');
    }

    # output: seqID offset end pure_length
    # where offset/end refer to the (possibly) gapped sequence
    printf "%s\t%d\t%d\t%d\n", $$pSeq{id}, $off, $end, $lenpure;
  }
}


# cut HTGS summary sequence into contigs
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# - global options:
#   -debug      [STD]
#
sub ProgCutHTGS {
  my ($pSeq) = @_;
  my ($debug, $NumN);
  my ($sSeq, @SeqArr);

  # parameters
  $debug = $ProgOpt{-debug};
  $NumN = $ProgOpt{-var}{numN} || 100;

  # delete gi identifier
  $$pSeq{header} =~ s/gi\|\w+\|//;

  # split purified sequence string (should be prepared in SeqStreamIn object)
  $sSeq = $$pSeq{SeqPure} || $$pSeq{sequence};
  delete $$pSeq{sequence};
  delete $$pSeq{SeqPure};
  @SeqArr = map { { %$pSeq, sequence=>$_ }; }
    split (/N{$NumN,}/io, $sSeq);

  # change IDs of sequence fragments
  if (@SeqArr > 1) {
    my $ItBase = '';
    my $CtI = 0;
    foreach $pSeq (@SeqArr) {
      my $ItChar = pack 'C', ($CtI++) + 97;
      unless ($$pSeq{header} =~ s/((emb|gb)\|\w+\.\d+)/${1}${ItBase}${ItChar}/) {
        $$pSeq{id} .= $ItBase . $ItChar;
        delete $$pSeq{header};
      }
      if ($CtI > 25) { $CtI = 0; $ItBase .= 'z'; }
    }
  }

  # output sequence fragments
  $ProgParam{store}{queue}{SeqOut}->Push(@SeqArr);
}


# convert sequence according to UDG treatment and subsequent endonuclease
# cleavage
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#
# DEVELOPER'S NOTES
# - the sequence string was purified in SeqStreamIn object
#
sub ProgUDG {
  my ($pSeq) = @_;

  # do transformation
  $$pSeq{sequence} =~ s/U+/N/ig;

  # break sequence at stretches of Ns, output sequence fragments
  $ProgOpt{-var}{numN} = 1;
  &ProgCutHTGS ($pSeq);
}


# concatenate all sequences from input
#
# INTERFACE
# - global options:
#   -debug       [STD]
#   -concat      length of spacer between joint sequences
#
sub ProgConcat {
  my ($debug, $dbg2);
  my ($pSeqConcat, $pSeq, @SeqID);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # create new sequence
  $pSeqConcat = &SeqQueue();
  $$pSeqConcat{id} = 'concat';

  # concatenate sequences
  while (defined ($pSeq=&SeqQueue())) {
    $pSeqConcat = &SeqConcat ($pSeqConcat, $pSeq, -spacer=>$ProgOpt{-concat}, -debug=>$dbg2);
    push @SeqID, $$pSeq{id};
  }
  $$pSeqConcat{descr} = sprintf "concatenated from %s", join (', ', @SeqID);
  if (length($$pSeqConcat{descr}) > 1024) {
    $$pSeqConcat{descr} = substr ($$pSeqConcat{descr}, 0, 1024) . ' ...';
  }

  # output sequence
  $ProgParam{store}{queue}{SeqOut}->Push($pSeqConcat);
}


################################################################################
# sequence annotation topology and feature extraction
################################################################################


# change annotation labels
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#
sub ProgAnnotRen {
  my ($pSeq) = @_;
  my ($debug, $pAnnotRen);
  my ($pAnnot, $AnnotNum);

  # function parameters
  $debug = $ProgOpt{-debug};
  $pAnnotRen = $ProgOpt{AnnotRen};

  # loop over annotations
  for ($AnnotNum=0; $AnnotNum<@{$$pSeq{annot}}; $AnnotNum++) {
    $pAnnot = $$pSeq{annot}[$AnnotNum];
    unless (exists $$pAnnotRen{$$pAnnot{type}}) { next }

    # delete annotation
    if (! $$pAnnotRen{$$pAnnot{type}}) {
      splice (@{$$pSeq{annot}}, $AnnotNum, 1);
      $AnnotNum--; next;
    }

    # or change label
    else {
      $$pAnnot{type} = $$pAnnotRen{$$pAnnot{type}};
    }
  }

  # output processed seq
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# expand annotation data structure
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# DESCRIPTION
# - the resulting sequence data structure is typically re-output in "struct"
#   format
#
sub ProgAnnotExpand {
  my ($pSeq) = @_;

  # enrich annotation in sequence
  #&AnnotRangeRes ($pSeq);
  &AnnotExpandTscr ($pSeq);

  # output sequence
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


sub ProgAnnotRrnaConst {
$GlobStore{feat2bder} = {
  'small subunit ribosomal RNA' =>
    { '-1'=>'SSU_5p', '1'=>'SSU_3p' },
  '5.8S ribosomal RNA' =>
    { '-1'=>'58S_5p', '1'=>'58S_3p' },
  'large subunit ribosomal RNA' =>
    { '-1'=>'LSU_5p', '1'=>'LSU_3p' },
  '5S ribosomal RNA' =>
    { '-1'=>'5S_5p',  '1'=>'5S_3p' },
  'internal transcribed spacer 1' =>
    { '-1'=>'SSU_3p', '1'=>'58S_5p' },
  'internal transcribed spacer 2' =>
    { '-1'=>'58S_3p', '1'=>'LSU_5p' },
  'intergenic spacer 1' =>
    { '-1'=>'LSU_3p', '1'=>'5S_5p' },
  'intergenic spacer 2' =>
    { '-1'=>'5S_3p',  '1'=>'SSU_5p' },
  };
$GlobStore{bder2fullfeat} = {
  SSU_5p   => { SSU_3p   => { id=>'SSU' } },
  SSU_3p   => { '58S_5p' => { id=>'ITS1' } },
  '58S_5p' => { '58S_3p' => { id=>'58S' } },
  '58S_3p' => { LSU_5p   => { id=>'ITS2' } },
  LSU_5p   => { LSU_3p   => { id=>'LSU' } },
  LSU_3p   => { '5S_5p'  => { id=>'IGS1' } },
  '5S_5p'  => { '5S_3p'  => { id=>'5S' } },
  '5S_3p'  => { SSU_5p   => { id=>'IGS2' } },
  };
}

# extract rDNA sequence entries from GenBank relying on annotations
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# DESCRIPTION
# - By default, extensive messaging is done to STDERR.
# - outline on procedure:
#   - select sequence entries
#   - ...
# - selection criteria for sequence entries:
#   - entry must contain annotation of type "rRNA"
#   - currently, output is confined to entries fitting taxonomy branch
#     "Eukaryota" being non-mitochondrial and non-plastid
#
# DEBUG, CHANGES, ADDITIONS
# - need to store dates (of submissions and publications) in a standardised and
#   sortable way.
# - currently, entries must contain annotation of type "rRNA" to be extracted
#   from the GenBank release. However, there are entries that have an
#   annotation of type "misc_RNA" exclusively. This is typically the case for
#   intergenic spacer sequences.
# - observed remaining cases of non-standardised rRNA species nomenclature (and
#   their frequencies):
#   '16S ribosomal RNA'           1378 [meaning?, cases increased lately]
#   '23S ribosomal RNA'           235 [meaning?, cases decreased(!) lately]
#   '19S ribosomal RNA'           2 [meaning?]
#   'ribosomal RNA' at beginning  253
#   /(small|large) ribosomal/     129 [missing 'subunit']
#   '28S-like ribosomal RNA'      43 [meaning?]
#   '23S-like ribosomal RNA'      15 [meaning?]
#   '16S-like ribosomal RNA' [implemented conversion to small subunit]
#   /ITS[12] ribosomal RNA/       40
#   '5.8 ribosomal RNA'           38 [missing 'S']
#   'mature rRNA'                 2
#   s/\bsub-unit\b/subunit/       6
#   s/\bsubuni[rt]\b/subunit/     1
#   'contains internal ...'       314 [enumeration of rDNA features]
#
sub ProgAnnotRrna {
  my ($pSeq) = @_;
  my ($pAnnot, @AnnotRrna, @AnnotBorder, @AnnotFFeat, $pEntry);

  # function constants
  my $reSSU = '\b(1[7-8]S|16S-like|SSU)\b';
  my $reLSU = '\b(2[5-8]S|LSU)\b';

  # loop over annotations
  foreach $pAnnot (grep { $_->{type} eq 'rRNA' } @{$$pSeq{annot}}) {

    # grab rRNA species label (hopefully, it's just this label, cf. below)
    $$pAnnot{text} =~ m/\bproduct="([^"]+?)"/ or
      $$pAnnot{text} =~ m/\bgene="([^"]+?)"/ or
      $$pAnnot{text} =~ m/\bnote="([^"]+?)"/;
    $$pAnnot{rRNA} = $$pAnnot{rRNA_orig} = $1;
    $$pAnnot{pseudo} = 1 if ($$pAnnot{text} =~ m/\b(truncated|pseudo(gene)?)\b/);
    $$pAnnot{pseudo_susp} = 1 if ($$pSeq{taxa} !~ m/\bActinopterygii\b/ and (grep /\bHi-b\b/, map{ $_->{text} }@{$$pSeq{annot}}));

    # enter rRNA annotation
    push @AnnotRrna, $pAnnot;
  }

  # select entries with rRNA annotations
  if (! @AnnotRrna) { return }

  # filter according to taxonomic status
  if ($$pSeq{taxa} !~ m/\bEukaryota\b/) { return }
  if ($$pSeq{organelle} =~ m/\b(chloroplast|kinetoplast|mitochond|plastid)/) { return }
  if ($$pSeq{organism} =~ m/\b(chloroplast|kinetoplast|mitochond|plastid)/) { return }
  if ($$pSeq{organelle} =~ m/\b(nucleomorph)/) {
    $$pSeq{organism} = $$pSeq{organelle} .' '. $$pSeq{organism};
    delete $$pSeq{organelle};
    $$pSeq{OrganismNum} = "($$pSeq{OrganismNum})";
  }
  if ($$pSeq{organelle}) {
    printf STDERR "%s. uncategorised organelle, seq %s: %s\n", &MySub,
      $$pSeq{id}, $$pSeq{organelle};
  }

  # enrich sequence entry data
  $$pSeq{SeqLength} = length ($$pSeq{sequence});

  # reformat subunit nomenclature
  # - universal coding for small/large subunit
  foreach (@AnnotRrna) {
    # leading redundant expressions
    $_->{rRNA} =~ s/^(nuclear) //i;
    # trailing bracketed/comma-separated expressions
    $_->{rRNA} =~ s/\s+\((NAR: \d+(, \d+)?|xxx)\)$//;
    $_->{rRNA} =~ s/(\s+(\d+))?\s+\(I[GT]S(\d)\)$/' '.($2||$3)/e;
    $_->{rRNA} =~ s/(\d+),\s+I[GT]S\1$/$1/e;
    # standard trailing "ribosomal RNA"
    $_->{rRNA} =~ s/ ([rR]ibo?s[ao]?r?mal |[rR]i{2}bosomal ||[rR]ibosome ||[rR]obosomal |r|)RNA/ ribosomal RNA/g;
    $_->{rRNA} =~ s/^($reSSU|5\.8S|$reLSU|5S)$/$& ribosomal RNA/o;
    $_->{rRNA} =~ s/\bribosomal RNA (gene)$/ribosomal RNA/;
    # standard subunit nomenclature
    $_->{rRNA} =~ s/(\d)\s+S\b/${1}S/g;  # non-spaced format for <size>S specification
    $_->{rRNA} =~ s/-subunit\b/ subunit/g;
    $_->{rRNA} =~ s/[0-9\.]+S\s+((large|small) subunit)/$1/g;
    $_->{rRNA} =~ s/$reSSU/small subunit/go;
    $_->{rRNA} =~ s/$reLSU/large subunit/go;
    if    ($$pSeq{taxa} =~ m/\bAlveolata\b/)  { $_->{rRNA} =~ s/\b24S\b/large subunit/g; }
    elsif ($$pSeq{taxa} =~ m/\bEuglenozoa\b/) { $_->{rRNA} =~ s/\b19S\b/small subunit/g; }
    # conversion failed?
    if ($_->{rRNA} !~ m/^(small subunit|5\.8S|large subunit|5S) ribosomal RNA$/ and
        $_->{rRNA} !~ m/^internal transcribed spacer [12]$/ and
        $_->{rRNA} !~ m/^intergenic spacer [12]$/
    ) {
      $_->{bad_nomen} = 1;
      printf STDERR "%s. unusual rRNA species, seq %s: %s -> %s\n", &MySub,
        $$pSeq{id}, $_->{rRNA_orig}||"''", $_->{rRNA}||"''";
    }
  }

  # examine reliability of features
  foreach $pAnnot (grep { ! $_->{pseudo} } @AnnotRrna) {
    if ($$pAnnot{range} =~ m/,/) { $$pAnnot{pseudo_susp} = 1; }
  }

  # find reliable feature borders
  # skip features showing "bad_nomen"
  { my ($pFeat, @range, $orient);
    foreach $pAnnot (grep { ! $_->{bad_nomen} } @AnnotRrna) {

      # match annotations, derive borders if there're overhangs
      if (exists $GlobStore{feat2bder}{$$pAnnot{rRNA}}) {
        $pFeat = $GlobStore{feat2bder}{$$pAnnot{rRNA}};
        @range = sort { $a<=>$b } grep { length($_) } split(/\D+/,$$pAnnot{range});
        $orient = ($$pAnnot{range} =~ m/complement/) ? -1 : 1;
        if ($range[0] > 1) {
          $$pAnnot{pseudo_susp} = 1 if ($$pAnnot{range} =~ m/[<>]=?$range[0]\b/);
          $$pAnnot{deriv_border} ++;
          push @AnnotBorder, {
            label  => $$pFeat{$orient*-1},
            pos    => $range[0] - 0.5,
            orient => $orient,
              (exists($$pAnnot{pseudo}) and $$pAnnot{pseudo}) ?
          ( pseudo => $$pAnnot{pseudo} ) : (),
              (exists($$pAnnot{pseudo_susp}) and $$pAnnot{pseudo_susp}) ?
          ( pseudo_susp => $$pAnnot{pseudo_susp} ) : (),
            };
        }
        if ($range[-1] < $$pSeq{SeqLength}) {
          $$pAnnot{pseudo_susp} = 1 if ($$pAnnot{range} =~ m/[<>]=?$range[-1]\b/);
          $$pAnnot{deriv_border} ++;
          push @AnnotBorder, {
            label  => $$pFeat{$orient},
            pos    => $range[-1] + 0.5,
            orient => $orient,
              (exists($$pAnnot{pseudo}) and $$pAnnot{pseudo}) ?
          ( pseudo => $$pAnnot{pseudo} ) : (),
              (exists($$pAnnot{pseudo_susp}) and $$pAnnot{pseudo_susp}) ?
          ( pseudo_susp => $$pAnnot{pseudo_susp} ) : (),
            };
        }
      } else {
        printf STDERR "%s. rRNA unusual later on, seq %s: %s\n", &MySub,
          $$pSeq{id}, $$pAnnot{rRNA}||"''";
      }
    }
  }

  # refine list of feature borders
  if (@AnnotBorder) {
    my ($CtI, $orient, @BderFlute);
    my $pBder2fFeat = $GlobStore{bder2fullfeat};

    # rational ordering of feature borders
    @AnnotBorder = sort {
      $a->{orient}<=>$b->{orient} or $a->{pos}<=>$b->{pos} or $a cmp $b
      } @AnnotBorder;

    # reduce doublettes of feature borders
    @BderFlute = sort { $a->{pos}<=>$b->{pos} or $a cmp $b } @AnnotBorder;
    for ($CtI=1; $CtI<@BderFlute; $CtI++) {
      if ($BderFlute[$CtI-1]{pos} == $BderFlute[$CtI]{pos}) {
        if ($BderFlute[$CtI-1]{label} ne $BderFlute[$CtI]{label} or
            $BderFlute[$CtI-1]{orient} != $BderFlute[$CtI]{orient}
        ) {
          printf STDERR "%s. conflict in feature border doublette, seq %s:"
            . " pos %s, label %s, orient %s\n", &MySub,
            $$pSeq{id}, $BderFlute[$CtI]{pos},
            join('/',map{ $_->{label} }@BderFlute[$CtI-1,$CtI]),
            join('/',map{ $_->{orient} }@BderFlute[$CtI-1,$CtI]);
          $BderFlute[$CtI-1]{pseudo} = $BderFlute[$CtI]{pseudo} = 1;
        } else {
          @AnnotBorder = grep{ $_ ne $BderFlute[$CtI] }@AnnotBorder;
          splice @BderFlute, $CtI, 1;
          $CtI --;
        }
      }
    }

    foreach $orient (-1, 1) {
      @BderFlute = grep { $_->{orient}==$orient } @AnnotBorder;
      @BderFlute = reverse(@BderFlute) if ($orient==-1);
      for ($CtI=1; $CtI<@BderFlute; $CtI++) {

        # reduce doublettes of feature borders
        if (exists($$pBder2fFeat{$BderFlute[$CtI-1]{label}}) and
            exists($$pBder2fFeat{$BderFlute[$CtI-1]{label}}{$BderFlute[$CtI]{label}}) and
        ! exists($BderFlute[$CtI-1]{pseudo}) and ! exists($BderFlute[$CtI]{pseudo}) and
        ! exists($BderFlute[$CtI-1]{pseudo_susp}) and ! exists($BderFlute[$CtI]{pseudo_susp})
        ) {
          push @AnnotFFeat, {
            label  => $$pBder2fFeat{$BderFlute[$CtI-1]{label}}{$BderFlute[$CtI]{label}}{id},
            length => abs ($BderFlute[$CtI-1]{pos}-$BderFlute[$CtI]{pos}),
            };
          printf STDERR "sampled full feature, seq %s: %s, length %d\n",
            $$pSeq{id}, $AnnotFFeat[-1]{label}, $AnnotFFeat[-1]{length};
        }
      }
    }
  }

  # construct/format and output seq entry data
  $pEntry = {
    SrcFile        => $$pSeq{SrcPath},
    AccNo          => $$pSeq{id},
    organism       => $$pSeq{organism},
    OrganismNum    => $$pSeq{OrganismNum},
      $$pSeq{organelle} ?
  ( organelle      => $$pSeq{organelle} ) : (),
    taxa           => $$pSeq{taxa},
    SeqLength      => $$pSeq{SeqLength},
    date           => $$pSeq{date},
    submit         => $$pSeq{submit},
    rRNA_annot     => [ ],
      int (grep{ $_->{bad_nomen} }@AnnotRrna) ?
  ( rRNA_bad_nomen => '1' ) : (),
      int(@AnnotBorder) ?
  ( rRNA_border    => [ @AnnotBorder ] ) : (),
      int(@AnnotFFeat) ?
  ( rRNA_fullfeature => [ @AnnotFFeat ] ) : (),
    };
  $$pEntry{SrcFile} =~ s|.+/||;
  $$pEntry{taxa} =~ s/\s+//g;
  foreach (@AnnotRrna) {
    push @{$$pEntry{rRNA_annot}}, {
      label     => $_->{rRNA},
      range     => $_->{range},
        $_->{bad_nomen} ?
    ( bad_nomen => $_->{bad_nomen} ) : (),
        $_->{pseudo} ?
    ( pseudo    => $_->{pseudo} ) : (),
        $_->{pseudo_susp} ?
    ( pseudo_susp => $_->{pseudo_susp} ) : (),
        $_->{deriv_border} ?
    ( deriv_border => $_->{deriv_border} ) : (),
      };
  }
  &DataPrint ([$pEntry]);
}


# work out certain annotation labels from complex annotations
#
# INTERFACE
# - argument 1: reference to sequence data structure
# - argument 2: reference to array of requested annotation labels
#
# - global options:
#   -debug      [STD]
#
# DEBUG, CHANGES, ADDITIONS
# - annotation label "TCS", see 
#
sub AnnotRequire {
  my ($pSeq,$pWantAnnot) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my %HaveAnnot = map{ ($_->{type}=>1) } @{$$pSeq{annot}};

  # test need for expansion of annotations
  foreach my $ItLabel (grep{ !$HaveAnnot{$_} } @$pWantAnnot) {
    if ($ItLabel =~ m/(CDS|[eE]xon|[iI]ntron|start|stop|TCS)/) {
    # does "TCS" work here? Test it!
      $debug and printf STDERR "%s. expanding transcript annotation\n", &MySub;
      &AnnotExpandTscr ($pSeq, -debug=>$dbg2);
      last;
    }
  }
  foreach my $ItLabel (grep{ !$HaveAnnot{$_} } @$pWantAnnot) {
    if ($ItLabel =~ m/(accept|donor|splice)/i) {
      $debug and printf STDERR "%s. expanding splice annotation\n", &MySub;
      &AnnotExpandSplice ($pSeq, -debug=>$dbg2);
      last;
    }
  }
}


# change annotated sequence to lower/upper case letters.
#
# INTERFACE
# - argument 1: requested letter case
# - argument 2: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#
sub ProgAnnotCase {
  my (%AnnotOrientStx) = ( '+'=>1, '-'=>-1, '='=>0 );
  my ($case, $pSeq) = @_;
  my ($debug, %SlcAnnotLbl, $SlcAnnotOrient);
  my ($pAnnot);

  # function parameters
  $debug = $ProgOpt{-debug};
  %SlcAnnotLbl = ( map { ($_=>1) } @{$ProgOpt{AnnotList}} );
  $debug and printf STDERR "%s. selected annotation labels: %s\n", &MySub, join(', ',keys(%SlcAnnotLbl));
  $SlcAnnotOrient = $AnnotOrientStx{$ProgOpt{-AnnotOrient}};
  $debug and printf STDERR "%s. selected annotation orientation: %d\n", &MySub, $SlcAnnotOrient;

  # expand complex annotations to requested annotations
  &AnnotRequire ($pSeq, $ProgOpt{AnnotList});

  # loop over annotations
  # - select for requested annotation type
  # - select for requested annotation orientation
  if (grep { exists($_->{range}) and !exists($_->{offset}) }
      grep { $SlcAnnotLbl{$_->{type}} }
      grep { ! $SlcAnnotOrient or $_->{orient}==$SlcAnnotOrient }
      @{$$pSeq{annot}}
  ) {
    &AnnotRangeRes ($$pSeq{annot});
  }
  foreach $pAnnot (
      grep { $SlcAnnotLbl{$_->{type}} }
      grep { ! $SlcAnnotOrient or $_->{orient}==$SlcAnnotOrient }
      @{$$pSeq{annot}}
  ) {

    # change seq in annotated range
    $debug and printf STDERR "%s. changing sequence range %d..%d\n", &MySub,
      $$pAnnot{offset}, $$pAnnot{end};
    if ($case eq 'lower') {
      substr ($$pSeq{sequence}, $$pAnnot{offset}-1, $$pAnnot{end}-$$pAnnot{offset}+1) =~ tr/A-Z/a-z/;
    } elsif ($case eq 'upper') {
      substr ($$pSeq{sequence}, $$pAnnot{offset}-1, $$pAnnot{end}-$$pAnnot{offset}+1) =~ tr/a-z/A-Z/;
    }
  }

  # output processed seq
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# mask annotated sequence ranges
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -AnnotLyr   mask according only to annotations in specified layer
#   -debug      [STD]
#   -MaskAnnot  program mode argument, annotation label selector
#
# - global data:
#   $ProgParam{MaskChar}
#
# DESCRIPTION
# - mask both read-anchored and contig-anchored annotations
# - masking symbol depends on sequence type (nt/aa)
#
sub ProgAnnotMask {
  my ($pSeq) = @_;
  my ($debug, $SeqType, $SmbMask, %SlcAnnotLbl);
  my ($pAnnot, $CtAnnot);

  # function parameters
  $debug = $ProgOpt{-debug};
  %SlcAnnotLbl = ( map { ($_=>1) } @{$ProgOpt{-MaskAnnot}} );
  $SmbMask = $ProgParam{MaskChar};
  $debug and printf STDERR "%s. sequence %s, type %s, masking symbol %s\n", &MySub,
    $$pSeq{id}, scalar &SeqType($$pSeq{sequence}), $SmbMask;

  # expand complex annotations to requested annotations
  &AnnotRequire ($pSeq, $ProgOpt{-MaskAnnot});

  # loop over annotations
  for ($CtAnnot=0; $CtAnnot<@{$$pSeq{annot}}; $CtAnnot++) {
    $pAnnot = $$pSeq{annot}[$CtAnnot];
    $debug and printf STDERR "%s. found annotation in seq %s: label %s, range %d..%d, layer %s\n", &MySub,
      $$pSeq{id}, $$pAnnot{type}, $$pAnnot{offset}, $$pAnnot{end}, $$pAnnot{layer}||"''";

    # select for annotation label and annotation layer
    $SlcAnnotLbl{$$pAnnot{type}} or next;
    if (exists ($ProgOpt{-AnnotLyr}) and $$pAnnot{layer} ne $ProgOpt{-AnnotLyr}) { next }

    # check validity of annotation
    if ($$pAnnot{offset}<1 or $$pAnnot{offset}>length($$pSeq{sequence})) {
      printf STDERR "%s. WARNING: annotation offset outside sequence range: seq %s, range %d..%d, seq length %d\n", &MySub,
        $$pSeq{id}, $$pAnnot{offset}, $$pAnnot{end}, length($$pSeq{sequence});
      next;
    }
    if ($$pAnnot{end}<1 or $$pAnnot{end}>length($$pSeq{sequence}) or $$pAnnot{end}<$$pAnnot{offset}) {
      printf STDERR "%s. WARNING: annotation end outside sequence range: seq %s, range %d..%d, seq length %d\n", &MySub,
        $$pSeq{id}, $$pAnnot{offset}, $$pAnnot{end}, length($$pSeq{sequence});
      next;
    }

    # mask sequence string, delete annotation
    $debug and printf STDERR "  masking annotation range\n";
    substr ($$pSeq{sequence}, $$pAnnot{offset}-1, $$pAnnot{end}-$$pAnnot{offset}+1)
      =~ s/[a-z]/${SmbMask}/ig;
    splice @{$$pSeq{annot}}, $CtAnnot, 1;
    $CtAnnot --;
  }

  # output sequence and return
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# output annotated sequence ranges
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   AnnotList   program mode argument, annotation label selector
#   -debug      [STD]
#   -flank      flanking sequence to be output in addition to the requested
#               feature
#   --gap       maximum length of non-annotated sequence to be swallowed into
#               annotated range
#
# DESCRIPTION
# - CDS output does not provide the switch feature -SlcValid like &ProgCatCds
#   does.
# - list of annotations is unfiltered in the output sequence slices. It might
#   be advantageous to filter the annotation types which were used to extract
#   the sequence ranges. On the other hand, it might be useful to keep this
#   annotation data. So, better we just leave it as it is.
#
# DEBUG, CHANGES, ADDITIONS
# - joint of same-type annotations has to be re-implemented
#   - skip annotations having complex range definitions
#   - annotation ranges with conflicting orientation should never be joint
# - Currently, there's a problem joining annotations which have complex range
#   definitions. How can we combine these, just from the computational
#   perspective? And, what would be reasonable, from the biological perspective?
#
sub ProgAnnotCatseq {
  my ($pSeq) = @_;
  my ($debug, $dbg2, $pQueueOut);
  my ($pSeqAn, $ItAnnotType, @AnnotSlc, $pAnnot, $CtAnnot);
  my ($pSeqAdd, $pSeqSub);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $pQueueOut = $ProgParam{store}{queue}{SeqOut};

  # expand complex annotations to requested annotations
  &AnnotRequire ($pSeq, $ProgOpt{AnnotList});
  # the following drastically reduces the computation effort (time!) in case
  # we do not output annotations
  # - move the annotations to a safe place
  $pSeqAn = {
    annot    => $$pSeq{annot} || [],
    AnnotGrp => $$pSeq{AnnotGrp} || {},
    };
  # - delete the annotations from the sequence entry if they're not required
  #   for output
  if ($$pQueueOut{switch}{-file} ne 'rewrite' and
     ($$pQueueOut{switch}{-format} eq '' or
      $$pQueueOut{switch}{-format} eq 'fastA' or
      $$pQueueOut{switch}{-format} eq 'plain' or
      $$pQueueOut{switch}{-format} eq 'table')
  ) {
    delete $$pSeq{annot};
    delete $$pSeq{AnnotGrp};
  }

  # loop over annotation labels
  foreach $ItAnnotType (@{$ProgOpt{AnnotList}}) {

    # select all annotations of given type
    unless (@AnnotSlc = grep { $_->{type} eq $ItAnnotType } @{$$pSeqAn{annot}}) {
      unless (@AnnotSlc = grep { $_->{type} eq $ItAnnotType } values %{$$pSeqAn{AnnotGrp}}) {
        unless (@AnnotSlc =
          sort { ($a->{offset}||($a->{range}=~m/(\d+)/)[0]) <=> ($b->{offset}||($b->{range}=~m/(\d+)/)[0]) }
          grep { ref($_) eq 'HASH' and $_->{type} eq $ItAnnotType }
          map { values %$_ } values %{$$pSeqAn{AnnotGrp}}
        ) {
          printf STDERR "%s. WARNING: cannot find annotations of type %s, seq %s\n", &MySub,
            $ItAnnotType||"''", $$pSeq{id}||"''";
          next;
        }
      }
    }
    # prepare copies of annotations (safe for any following manipulations)
    @AnnotSlc = map { \%{$_} } @AnnotSlc;
#    # delete effective range for selected annotation type
#    # so, sequences in output won't contain these annotation types anymore
#    $$pAnnot{offset} = $$pAnnot{end} = 0;

    # loop over annotations of selected type
    for (my $i=0; $i<@AnnotSlc; $i++) {
      $pAnnot = $AnnotSlc[$i];

      # harmonise range information
      # - ensure: offset / end values
      # - ensure: complex range value
      if (exists $$pAnnot{range} and $$pAnnot{range}) {
        $$pAnnot{offset} ||= ($$pAnnot{range} =~ m/(\d+)/)[0];
        $$pAnnot{end} ||= ($$pAnnot{range} =~ m/.*\b(\d+)/)[0];
        $$pAnnot{orient} ||= ($$pAnnot{range} =~ m/complement/) ? -1 : +1;
        $$pAnnot{bRangeCplx} = 1;
      } else {
        $$pAnnot{range} = "$$pAnnot{offset}..$$pAnnot{end}";
        if ($$pAnnot{orient} < 0) {
          $$pAnnot{range} = "complement($$pAnnot{range})";
        }
      }

      # test for overlaps and bridging gaps in previous annotation ranges
      if (! $$pAnnot{bRangeCplx} and
          exists $ProgOpt{-var}{gap} and defined $ProgOpt{-var}{gap}) {
        for (my $j=0; $j<$i; $j++) {
          $AnnotSlc[$j]{bRangeCplx} and next;
          if ($AnnotSlc[$j]{orient} != $$pAnnot{orient}) { next }
          if ($AnnotSlc[$j]{end}+$ProgOpt{-var}{gap} < $$pAnnot{offset}-1) { next }

          # overlap or gap => join annotation data structures
          $debug and printf STDERR "%s. joining annotations: ranges %s / %s\n", &MySub,
            $AnnotSlc[$j]{range}, $$pAnnot{range};
          # currently commented off this code feature
          $AnnotSlc[$j]{end} = &Max ($AnnotSlc[$j]{end}, $$pAnnot{end});
          $AnnotSlc[$j]{range} = "$AnnotSlc[$j]{offset}..$AnnotSlc[$j]{end}";
          if ($AnnotSlc[$j]{orient} < 0) {
            $AnnotSlc[$j]{range} = "complement($AnnotSlc[$j]{range})";
          }
          splice @AnnotSlc,$i,1; --$i;
          last;
        }
      }
    }

    # modify range definitions according to program switch -flank
    # - $$pAnnot{OffAdd} and $$pAnnot{EndAdd} specify range overhangs that
    #   cannot be derived from the sequence because the distance of the feature
    #   from one of the sequence ends is less than the requested flank size.
    #   They will be filled with preceding/trailing gaps.
    if ($ProgOpt{-flank}) {
      foreach $pAnnot (@AnnotSlc) {
        $$pAnnot{offfin} = $$pAnnot{offset} - $ProgOpt{-flank};
        if ($$pAnnot{offfin} < 1) {
          $$pAnnot{OffAdd} = -$$pAnnot{offfin} + 1;
          $$pAnnot{offfin} = 1;
        }
        $$pAnnot{range} =~ m/(\d+)/ and
          $$pAnnot{range} = $` . $$pAnnot{offfin} . $';
        $$pAnnot{endfin} = $$pAnnot{end} + $ProgOpt{-flank};
        if ($$pAnnot{endfin} > length($$pSeq{sequence})) {
          $$pAnnot{EndAdd} = length($$pSeq{sequence}) - $$pAnnot{endfin};
          $$pAnnot{endfin} = length ($$pSeq{sequence});
        }
        if ($$pAnnot{orient} < 0) {
          @{$pAnnot}{'OffAdd','EndAdd'} = @{$pAnnot}{'EndAdd','OffAdd'};
        }
        $$pAnnot{range} =~ m/(.*\b)(\d+)/ and
          $$pAnnot{range} = $` . $1 . $$pAnnot{endfin} . $';
      }
    }

    # output annotation range, including flanks
    $CtAnnot = 0;
    foreach $pAnnot (@AnnotSlc) {
      $CtAnnot ++;

      # prepare sequence corresponding to annotation range
      $debug and printf STDERR "%s. preparing sequence range: seq %s, range %s\n", &MySub,
        $$pSeq{id}, $$pAnnot{range};
      $pSeqSub = &SeqCplxRange ($pSeq, $$pAnnot{range}, -isPure=>$ProgOpt{-pure});
      $$pSeqSub{descr} =~ m/(.*)range/;
      $$pSeqSub{descr} = $` . $1 .
        sprintf ('%s%s%s #%d, ',
          $$pSeq{descr} ? ', ':'',
          $$pAnnot{group} ? "group $$pAnnot{group}, " : '',
          $ItAnnotType, $CtAnnot) .
        'range' . $';
      $$pSeqSub{id} .= sprintf ('_%s_%d', $ItAnnotType, $CtAnnot);

      # add gaps to have flanks of constant length in all sequences
      if ($$pAnnot{OffAdd}) {
        $pSeqAdd = {
          id => $$pSeqSub{id},
          descr => $$pSeqSub{descr},
          sequence => ('-'x $$pAnnot{OffAdd}) };
        $pSeqSub = SeqConcat ($pSeqAdd, $pSeqSub);
        $$pSeqSub{descr} = $$pSeqAdd{descr};
      }
      if ($$pAnnot{EndAdd}) {
        $pSeqAdd = { id=>$$pSeqSub{id}, descr=>$$pSeqSub{descr},
                     sequence=>('-'x $$pAnnot{EndAdd}) };
        $pSeqSub = SeqConcat ($pSeqSub, $pSeqAdd);
        $$pSeqSub{descr} = $$pSeqAdd{descr};
      }
      if ($ProgOpt{-var}{uplow}) {
        if ($ProgOpt{-flank}) {
          substr ($$pSeqSub{sequence},0,$ProgOpt{-flank})
            = lc(substr($$pSeqSub{sequence},0,$ProgOpt{-flank}));
          substr ($$pSeqSub{sequence},-$ProgOpt{-flank},$ProgOpt{-flank})
            = lc(substr($$pSeqSub{sequence},-$ProgOpt{-flank},$ProgOpt{-flank}));
        }
        substr ($$pSeqSub{sequence},$ProgOpt{-flank},$$pAnnot{end}-$$pAnnot{offset}+1)
          = uc(substr($$pSeqSub{sequence},$ProgOpt{-flank},$$pAnnot{end}-$$pAnnot{offset}+1));
      }

      # push sequence onto output queue
      $pQueueOut->Push($pSeqSub);
    }

  } # end: loop over annotation types
}


# genes' upstream regions from E.coli genome
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# - global options:
#   -debug      [STD]
#   --nonred    non-redundant regions, default: output both orientations for
#               head-to-head gene pairs.
#
# DESCRIPTION
# - generally applicable to GenBank-formatted bacterial genomes - don't know
#   about other formats.
# - intergenic regions are output in orientation of the upstream gene. If
#   there's a head-to-head arrangement the intergenic region will be output
#   twice (in the two possible orientations).
#
# DEBUG, CHANGES, ADDITIONS
# - would it also be possible to analyse spliced genes (CDSs)?
#
sub ProgIntgenBactCatseq {
  my ($pSeq) = @_;
  my ($debug, $bNonred);
  my ($CtAnnot, $pAnnot, $pAnnotNgb);
  my ($CtNgb, %intgen, %SeqRangeHad, $pSeqSub, $CtSeqOut);

  # function parameters
  $debug = $ProgOpt{-debug};
  $bNonred = (exists $ProgOpt{-var}{nonred}) ? $ProgOpt{-var}{nonred} : 0;

  # metaformat annotations
  $$pSeq{descr} =~ s/, complete genome\./ genome/;
  &AnnotRangeRes ($$pSeq{annot});
  if ($debug) {
    printf STDERR "%s. sequence %s\n", &MySub, $$pSeq{id};
    printf STDERR "  length: %d\n", length ($$pSeq{sequence});
    printf STDERR "  elementary annotations: %d\n", int @{$$pSeq{annot}};
  }

  # convert non-protein coding gene annotations
  for ($CtAnnot=0; $CtAnnot<@{$$pSeq{annot}}; $CtAnnot++) {
    $pAnnot = $$pSeq{annot}[$CtAnnot];
    # we assume strict annotation order

    # identify RNA gene annotations
    # - delete gene annotation
    # - convert RNA annotation to "gene_other" annotation
    if ($$pAnnot{type} =~ m/^(misc_|r|t)RNA$/) {
      $pAnnotNgb = (grep { $_->{type} eq 'gene' and
          $_->{offset} == $$pAnnot{offset} and
          $_->{orient} == $$pAnnot{orient} and
        1 } $$pSeq{annot}[$CtAnnot-1], $$pSeq{annot}[$CtAnnot+1])[0];
      if ($pAnnotNgb) {
        splice @{$$pSeq{annot}},
          ($pAnnotNgb eq $$pSeq{annot}[$CtAnnot-1]) ? $CtAnnot-1 : $CtAnnot+1, 1;
        $CtAnnot --;
        printf STDERR "%s. deleting gene annotation corresponding to RNA annotation\n", &MySub;
        printf STDERR "  label: %s\n", $$pAnnot{type};
        printf STDERR "  range: %d..%d\n", $$pAnnot{offset}, $$pAnnot{end};
        printf STDERR "  orientation: %s\n", &SignChar ($$pAnnot{orient});
      } else {
        printf STDERR "%s. do not find neighbouring gene annotation corresponding to RNA annotation\n", &MySub;
        printf STDERR "  label: %s\n", $$pAnnot{type};
        printf STDERR "  range: %d..%d\n", $$pAnnot{offset}, $$pAnnot{end};
        printf STDERR "  orientation: %s\n", &SignChar ($$pAnnot{orient});
      }
      $$pAnnot{type} = 'gene_other';
    }
  }

  # loop over protein-coding gene annotations
  for ($CtAnnot=0; $CtAnnot<@{$$pSeq{annot}}; $CtAnnot++) {
    $pAnnot = $$pSeq{annot}[$CtAnnot];

    # identify gene annotations, only protein-coding genes are left
    if ($$pAnnot{type} ne 'gene') { next }

    # identify neighbouring gene annotation, any type
    $CtNgb = $CtAnnot - $$pAnnot{orient};
    while ($$pSeq{annot}[$CtNgb]{type} !~ m/^gene/ and $CtNgb < int(@{$$pSeq{annot}})) {
      $CtNgb -= $$pAnnot{orient};
    }
    if ($CtNgb >= 0 and $CtNgb < int(@{$$pSeq{annot}})) {
      $pAnnotNgb = $$pSeq{annot}[$CtNgb];
    } else {
      printf STDERR "%s. gene annotation without neighbour: %s %d..%d\n", &MySub,
        &SignChar ($$pAnnot{orient}), $$pAnnot{offset}, $$pAnnot{end};
      next;
    }

    # filter gene overlaps
    $intgen{orient} = $$pAnnot{orient};
    if (($intgen{orient} > 0 and $$pAnnotNgb{end} >= $$pAnnot{offset}) or
        ($intgen{orient} < 0 and $$pAnnot{end} >= $$pAnnotNgb{offset})
    ) {
      printf STDERR "%s. overlapping gene annotations: %s %d..%d / %s %d..%d\n", &MySub,
        &SignChar ($$pAnnotNgb{orient}), $$pAnnotNgb{offset}, $$pAnnotNgb{end},
        &SignChar ($$pAnnot{orient}), $$pAnnot{offset}, $$pAnnot{end};
      next;
    }

    # construct complex range definition
    $intgen{offset} = ($intgen{orient} > 0) ? $$pAnnotNgb{end}+1 : $$pAnnot{end}+1;
    $intgen{end} = ($intgen{orient} > 0) ? $$pAnnot{offset}-1 : $$pAnnotNgb{offset}-1;
    if ($intgen{offset} >= $intgen{end} and abs($intgen{offset}-$intgen{end}) < 4) {
      printf STDERR "%s. intergenic NULL range for gene at: %s %d..%d, neighbour %s %d..%d\n", &MySub,
        &SignChar ($$pAnnot{orient}), $$pAnnot{offset}, $$pAnnot{end},
        &SignChar ($$pAnnotNgb{orient}), $$pAnnotNgb{offset}, $$pAnnotNgb{end};
      next;
    }
    $intgen{range} = sprintf ('%s..%s', $intgen{offset}, $intgen{end});
    if ($bNonred and $SeqRangeHad{$intgen{range}}) { next }
    $SeqRangeHad{$intgen{range}} = 1;
    if ($intgen{orient} < 0) {
      $intgen{range} = "complement($intgen{range})";
    }
    $intgen{categ} = sprintf ('%s5', ($$pAnnotNgb{type} ne 'gene') ? '0' :
      $SyntaxTranslNtdrc{End2PrimeNum}{ $$pAnnotNgb{orient} * $intgen{orient} }
      );

    # output sequence fragment
    if ($debug) {
      printf STDERR "%s. extracting intergenic range\n", &MySub;
      printf STDERR "  polII gene: %s %d..%d\n  neighbour %s %d..%d (type %s)\n",
        &SignChar ($$pAnnot{orient}), $$pAnnot{offset}, $$pAnnot{end},
        &SignChar ($$pAnnotNgb{orient}), $$pAnnotNgb{offset}, $$pAnnotNgb{end}, $$pAnnotNgb{type};
      printf STDERR "  intergenic: %s (category %s)\n", $intgen{range}, $intgen{categ};
    }
    $pSeqSub = &SeqCplxRange ($pSeq, $intgen{range});
    $$pSeqSub{id} .= sprintf ('_inter_CDS_%d', ++$CtSeqOut);
    $$pSeqSub{descr} .= sprintf (', category %s', $intgen{categ});
    delete $$pSeqSub{header};
    $ProgParam{store}{queue}{SeqOut}->Push($pSeqSub);
  }
}


# intergenic regions from Dictyostelium genome
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# DESCRIPTION
# - function works fine with GenBank-formatted sequence - don't know about
#   other formats.
# - intergenic regions are output in the orientation as they reside in the
#   genome.
#
sub ProgIntgenDictyCatseq {
  my ($pSeq) = @_;
  my ($debug, $bNonred);
  my (@CdsAnnot, $CtAnnot, $pAnnot, $pAnnotNgb);
  my (%intgen, $pSeqSub, $CtSeqOut);

  # function parameters
  $debug = $ProgOpt{-debug};

  # metaformat annotations
  &AnnotRangeRes ($$pSeq{annot});
  &AnnotExpandTscr ($pSeq);

  # refine structure of CDS annotations
  @CdsAnnot = map { $_->{CDS} ? $_->{CDS} : () }
    values %{$$pSeq{AnnotGrp}};
  $debug and printf STDERR "%s. CDS annotations: %d\n", &MySub, int @CdsAnnot;
  if (int (@CdsAnnot) <= 1) { return }
  foreach $pAnnot (@CdsAnnot) {
    unless ($$pAnnot{orient}) {
      $$pAnnot{orient} = ($$pAnnot{range} =~ m/complement/) ? -1 : +1;
      $$pAnnot{offset} = ($$pAnnot{range} =~ m/(\d+)/)[0];
      $$pAnnot{end} = ($$pAnnot{range} =~ m/.*\b(\d+)/)[0];
    }
  }
  @CdsAnnot = sort { $$a{offset}<=>$$b{offset} } @CdsAnnot;

  # loop over protein-coding gene annotations
  for ($CtAnnot=0; ($CtAnnot+1)<@CdsAnnot; $CtAnnot++) {
    $pAnnot = $CdsAnnot[$CtAnnot];
    $pAnnotNgb = $CdsAnnot[$CtAnnot+1];
    if ($$pAnnotNgb{offset} - $$pAnnot{end} < 2) { next }

    # construct complex range definition
    %intgen = (
      orient => 1,
      offset => $$pAnnot{end}+1,
      end    => $$pAnnotNgb{offset}-1,
      );
    $intgen{range} = sprintf ('%s..%s', $intgen{offset}, $intgen{end});

    # output sequence fragment
    if ($debug) {
      printf STDERR "%s. extracting intergenic range\n", &MySub;
      printf STDERR "  gene: %s %d..%d\n  neighbour %s %d..%d\n",
        &SignChar ($$pAnnot{orient}), $$pAnnot{offset}, $$pAnnot{end},
        &SignChar ($$pAnnotNgb{orient}), $$pAnnotNgb{offset}, $$pAnnotNgb{end};
      printf STDERR "  intergenic: %s\n", $intgen{range};
    }
    $pSeqSub = &SeqCplxRange ($pSeq, $intgen{range});
    $$pSeqSub{id} .= sprintf ('_inter_CDS_%d', ++$CtSeqOut);
    $$pSeqSub{descr} .= sprintf (', orient %s%s',
      $SyntaxTranslNtdrc{End2PrimeNum}{ &Sign($$pAnnot{orient}) },
      $SyntaxTranslNtdrc{End2PrimeNum}{ &Sign($$pAnnotNgb{orient}*-1) } );
    $ProgParam{store}{queue}{SeqOut}->Push($pSeqSub);
  }
}


# extract 1st introns of transcript annotations
#
# INTERFACE
# - argument 1: reference on sequence data structure
#
# DESCRIPTION
# - final extraction and output is done in &ProgAnnotCatseq
#
sub ProgIntron1stCatseq {
  my ($pSeq) = @_;
  my ($debug, $pAnnotListBak);
  my ($pGrp, @SlcIntron, %DelIdx);

  # function parameters
  $debug = $ProgOpt{-debug};

  # enrich annotation in sequence
  #&AnnotRangeRes ($pSeq);
  &AnnotExpandTscr ($pSeq);

  # loop over transcript annotation groups
  foreach $pGrp (values %{$$pSeq{AnnotGrp}}) {
    if ($$pGrp{type} ne 'TCS') { next }

    # select introns belonging to transcript
    @SlcIntron = sort { $$a{orient} * ($$a{offset}<=>$$b{offset}); }
      grep { $_->{type} eq 'intron' and $_->{group} eq $$pGrp{group} }
      @{$$pSeq{annot}};
    shift @SlcIntron;
    foreach (@SlcIntron) { $DelIdx{"$_"}=1; }
  }

  # delete non-first introns
  $debug  and printf STDERR "%s. %d elementary annotations prior to selection, %d to delete\n", &MySub,
    int @{$$pSeq{annot}}, int (keys %DelIdx);
  for (my $CtI=0; $CtI<@{$$pSeq{annot}}; $CtI++) {
    if ($DelIdx{"$$pSeq{annot}[$CtI]"}) {
      splice @{$$pSeq{annot}}, $CtI, 1;
      $CtI --; next;
    }
  }

  # extraction and output as for normal annotation
  $pAnnotListBak = $ProgOpt{AnnotList};
  $ProgOpt{AnnotList} = [ 'intron' ];
  &ProgAnnotCatseq ($pSeq);
  $ProgOpt{AnnotList} = $pAnnotListBak;
}


# report on coding potential of sequence input
#
# INTERFACE
# - global options:
#   -debug      [STD]
#   -SlcValid   include CDS data which has proven erroneous
#
# DESCRIPTION
# - this function is very similar to GscjGap.pl -GeneReport
# - see &SeqLab::SeqAnalysis::CdsStruct for support of file formats
#   and corresponding specific data structure architecture
# - we will provide purified input sequences to
#   &SeqLab::SeqAnalysis::CdsStruct in order to get gap-free CDS
#   output.
#
sub ProgCatCds {
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # get CDS report data structure
  my $pCdsStruct = &CdsStruct ($ProgParam{store}{queue}{SeqIn},
    -IncError => (defined($ProgOpt{-SlcValid}) and ! $ProgOpt{-SlcValid}) ? 1:undef,
    -debug    => $dbg2);

  # output CDS sequences (currently, fastA only)
  print  $$pCdsStruct{SeqCds};
}


################################################################################
# sequence analysis
################################################################################


# plot low complexity score
#
# INTERFACE
# - global options:
#   -debug        [STD]
#   -OutImg       [STD]
#   -OutImgWidth  [STD]
#   -timer        [STD]
#
# DESCRIPTION
# - low complexity is defined as repetition of small sequence units
#   It's mostly equivalent to the presense of poly(A), microsatellite, or
#   minisatellite elements.
# - in contrast to &ProgLowcplxMask and &ProgLowcplxAnnot, work is done solely
#   in perl, here.
#
sub ProgLowcplx {
  my ($UnitArg, @UnitSet, $debug, $dbg2, $bTimer, $time);
  my ($pSeq, $iSeqLen);
  my (@smb, $CtPos, $ItSmb, $ItUnit, $CtCol, %correl);
  my (%img, %graph);

  ##############################################################################
  # pre-work, loop over sequences

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};
  $UnitArg = $ProgOpt{-LowcplxUnit} || $ProgParam{default}{LowcplxUnit};
  $UnitArg =~ s/^\d+$/1..$&/;
  @UnitSet = ( eval $UnitArg );
  unless (@UnitSet) {
    printf STDERR "%s. ERROR: undefined repeat unit set from arg '%s'\n", &MySub, $UnitArg;
    exit 1;
  } else {
    $debug and printf STDERR "%s. repeat unit set: %s\n", &MySub, join (' ', @UnitSet);
  }

  $img{width} = $ProgOpt{-OutImgWidth} || $ProgParam{default}{OutImgWidth};
  $img{RelHigh} = $ProgOpt{-OutImgRelhigh} || $ProgParam{default}{OutImgRelhigh}{lowcplx};

  # loop over sequences
  # - we got purified sequence from SeqStreamIn object
  while ($pSeq = &SeqQueue()) {
    $iSeqLen = length $$pSeq{sequence};

    # smoothening parameters
    $img{SmoothWin} = $ProgOpt{-WinSize} || $ProgParam{default}{WinSize}{lowcplx};
    $img{SmoothStep} = 0.5 * $iSeqLen / $img{width};

  ##############################################################################
  # analyse symbols and tuples

    # sample correlation data
    # for default $UnitArg this is faster than grabbing symbols by
    #   substr($$pSeq{sequence},$CtPos,1). However, this way we need some
    #   more memory (for @smb)
    $bTimer and $time = (times)[0];
    @smb = split (//, $$pSeq{sequence});
    for ($CtPos=0; $CtPos<($iSeqLen-1); $CtPos++) {
      $ItSmb = $smb[$CtPos];
      foreach $ItUnit (@UnitSet) {
        push @{$correl{orig}[$ItUnit]},
          ($ItSmb eq $smb[$CtPos+$ItUnit]);
      }
    }
    $bTimer and printf STDERR "%s. CPU time for correlation analysis: %.3f\n", &MySub, (times)[0]-$time;

    # we might have to correct sequence positions to the center of the correlation
    # window rather than the left edge of the window

    # smoothened plots
    $bTimer and $time = (times)[0];
    foreach $ItUnit (@UnitSet) {
      $debug and printf STDERR "%s. calculating smoothened data: repeat unit %d, step %s, window %s\n", &MySub,
        $ItUnit, $img{SmoothStep}, $img{SmoothWin};
      $correl{Plot2D}[$ItUnit] = Math::Plot2D->new($correl{orig}[$ItUnit], -TabType=>'A1y', -extrapolate=>'mean');
    }
    $bTimer and printf STDERR "%s. CPU time for creating plot objects: %.3f\n", &MySub, (times)[0]-$time;

  ##############################################################################
  # prepare image

    # sample data
    %graph = (
      BgTranspar => $ProgOpt{-OutImgTransp},
      plot => [ ],
      scale => [
          { PlotNum  => 0,
            location => 'bottom',
          },
          { PlotNum  => 0,
            location => 'left',
          },
          { PlotNum  => 0,
            location => 'right',
          },
        ],
      );
    $bTimer and $time = (times)[0];
    undef $CtCol;
    foreach $ItUnit (@UnitSet) {
      push  @{$graph{plot}}, {
        DataType  => 'AA',
        data      => $correl{Plot2D}[$ItUnit]->SmoothPlot ($img{SmoothStep}, -window=>$img{SmoothWin}, -debug=>$dbg2),
        DataRange => {
          x => [ 0, $iSeqLen ],
          y => [ 0, 1 ],
          },
        ReprType  => 'line',
        ReprColor => $ColorFlute[int($CtCol)],
        };
      $CtCol ++;
    }
    $bTimer and printf STDERR "%s. CPU time for plot smoothening: %.3f\n", &MySub, (times)[0]-$time;
    $graph{plot}[0]{DimPixel}  = { x=>$img{width}, y=>0 };
    $graph{plot}[0]{HeightRel} = $img{RelHigh};

    # create and save image
    $bTimer and $time = (times)[0];
#    $img{PathDat} = &PathUnique (-name=>&PrepOstump($pSeq,-stamp=>'_lowcplx').'#.dat', -NoSize=>1, -touch=>1);
#    &WriteFile ($img{PathDat}, &DataPrint (\%graph));
#    print  "image data saved to $img{PathDat}\n";
    $img{path} = $ProgOpt{-OutImg} || &PathUnique (-name=>&PrepOstump($pSeq,-stamp=>'_lowcplx').'#.png', -NoSize=>1, -touch=>1);
    unless (&Graph (\%graph, -save=>$img{path}, -timer=>$bTimer, -debug=>$dbg2)) {
      printf STDERR "ERROR: unable to save image to %s\n", $img{path}|"''";
      undef $img{path};
    } else {
      print  "image saved to $img{path}\n";
    }
    $bTimer and printf STDERR "%s. CPU time for plot image preparation: %.3f\n", &MySub, (times)[0]-$time;

  }  # end seq loop
}


# annotate sequence ranges of low complexity
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#   -LowcplxID   program mode argument: minimum repetitive correlation
#               throughout range
#   -LowcplxLen  program mode argument: minimum length of range
#
# - global data:
#   $ProgParam{default}{AnnotLbl}{lowcplx}
#
# DESCRIPTION
# - mask both read-anchored and contig-anchored annotations
#
sub ProgLowcplxAnnot {
  my ($pSeq) = @_;
  my ($debug, $dbg2, $AnnotLabel);
  my ($PathSeq, $PathErr, $CallArgID, $CallArgLen, $sCall);
  my ($pTable, $pAnnot, $pRangeGapped);

  # function parameters
  $debug = $ProgOpt{-debug} || 0;
  $dbg2  = $debug ? $debug-1 : undef;
  $AnnotLabel = $ProgOpt{-AnnotLbl} || $ProgParam{default}{AnnotLbl}{lowcplx};

  # save sequence in table format
  # sequence string has been purified in SeqStreamIn
  $PathSeq = $ProgParam{TmpManag}->Create(-touch=>1);
  $PathErr = $ProgParam{TmpManag}->Create(-touch=>1);
  $$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence});
  &WriteFile ($PathSeq, &SeqentryToFFmt ($pSeq, -KeySeq=>'SeqPure', -format=>'table'));

  # call binary to locate low-complexity regions
  # read list of hits
  $CallArgID  = $ProgOpt{-LowcplxID} ? "-i $ProgOpt{-LowcplxID}" : '';
  $CallArgLen = $ProgOpt{-LowcplxLen} ? "-l $ProgOpt{-LowcplxLen}" : '';
  $sCall = "$CorePath{call}{LowCplx} $CallArgID $CallArgLen < $PathSeq 2> $PathErr";
  $debug and printf STDERR "%s. calling: $sCall\n", &MySub;
  $pTable = &PlainToTable ("$sCall |",
    -TabType     => 'AH',
    -ColLabel => [qw(id unit cons offset end score RelID)],
    -comments    => 1,
    -debug       => $dbg2);
  if (-s $PathErr) {
    printf STDERR "%s. ERROR message from %s, seq %s\n", &MySub,
      $CorePath{call}{LowCplx}, $$pSeq{id};
    print  STDERR "  call was: $sCall\n";
    print  STDERR &ReadFile ($PathErr);
  }
  unlink ($PathSeq, $PathErr);
  if (-s $PathErr) { return }
  $debug>1 and &DataPrint ($pTable, -handle=>\*STDERR);

  # delete existing annotations with current label
  $debug and printf STDERR "%s. deleting existing annotations labelled $AnnotLabel\n", &MySub;
  @{$$pSeq{annot}} = grep { $_->{type} ne $AnnotLabel } @{$$pSeq{annot}};

  # loop over low-complexity regions
  foreach $pAnnot (@$pTable) {
    $debug and printf STDERR "%s. found low-complexity region, seq %s, range %d..%d (ungapped)\n", &MySub,
      $$pSeq{id}, $$pAnnot{offset}, $$pAnnot{end};

    # annotate low-complexity regions on gapped sequence string
    $pRangeGapped = &SeqRangeGapped ($$pSeq{SeqPure}, $$pSeq{sequence}, [$$pAnnot{offset}, $$pAnnot{end}]);
    push @{$$pSeq{annot}}, {
      type   => $AnnotLabel,
      orient => 0,
      offset => $$pRangeGapped{-1},
      end    => $$pRangeGapped{1},
      text   => sprintf ("SimpleRepeat %s\nrel_identity=%.3f",
                $$pAnnot{cons}, $$pAnnot{RelID}),
      };
  }

  # output sequence and return
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# mask sequence ranges of low complexity
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#   -LowcplxID   program mode argument: minimum repetitive correlation
#               throughout range
#   -LowcplxLen  program mode argument: minimum length of range
#
# - global data:
#   $ProgParam{MaskChar}
#
# DESCRIPTION
# - mask both read-anchored and contig-anchored annotations
#
sub ProgLowcplxMask {
  my ($pSeq) = @_;
  my ($debug, $dbg2, $SmbMask, $CallArgID, $CallArgLen);
  my ($PathSeq, $PathErr, $sCall);
  my ($pTable, $pAnnot, $pRangeGapped);

  # function parameters
  $debug = $ProgOpt{-debug} || 0;
  $dbg2  = $debug ? $debug-1 : undef;
  $SmbMask = $ProgParam{MaskChar};
  $CallArgID  = $ProgOpt{-LowcplxID} ? "-i $ProgOpt{-LowcplxID}" : '';
  $CallArgLen = $ProgOpt{-LowcplxLen} ? "-l $ProgOpt{-LowcplxLen}" : '';

  # save sequence in table format
  # sequence string has been purified in SeqStreamIn
  $PathSeq = $ProgParam{TmpManag}->Create(-touch=>1);
  $PathErr = $ProgParam{TmpManag}->Create(-touch=>1);
  $$pSeq{SeqPure} ||= &SeqStrPure ($$pSeq{sequence});
  &WriteFile ($PathSeq, &SeqentryToFFmt ($pSeq, -KeySeq=>'SeqPure', -format=>'table'));

  # call binary to locate low-complexity regions
  # read list of hits
  $sCall = "$CorePath{call}{LowCplx} $CallArgID $CallArgLen < $PathSeq 2> $PathErr";
  $debug and printf STDERR "%s. calling: $sCall\n", &MySub;
  $pTable = &PlainToTable ("$sCall |",
    -TabType     => 'AH',
    -ColLabel => [qw(id unit cons offset end score RelID)],
    -comments    => 1,
    -debug       => $dbg2);
  if (-s $PathErr) {
    printf STDERR "%s. ERROR message from %s, seq %s\n", &MySub,
      $CorePath{call}{LowCplx}, $$pSeq{id};
    print  STDERR "  call was: $sCall\n";
    print  STDERR &ReadFile ($PathErr);
  }
  unlink ($PathSeq, $PathErr);
  if (-s $PathErr) { return }
  $debug>1 and &DataPrint ($pTable, -handle=>\*STDERR);

  # loop over low-complexity regions
  # - render gapped sequence
  if (length($$pSeq{SeqPure}) != length($$pSeq{sequence})) {
    foreach $pAnnot (@$pTable) {
      $debug and printf STDERR "%s. found low-complexity region, seq %s, range %d..%d (ungapped)\n", &MySub,
        $$pSeq{id}, $$pAnnot{offset}, $$pAnnot{end};

      # mask low-complexity regions on gapped sequence string
      $pRangeGapped = &SeqRangeGapped ($$pSeq{SeqPure}, $$pSeq{sequence}, [$$pAnnot{offset}, $$pAnnot{end}]);
      substr ($$pSeq{sequence}, $$pRangeGapped{-1}-1, $$pRangeGapped{1}-$$pRangeGapped{-1}+1)
        =~ s/[a-zA-Z]/$SmbMask/g;
    }
  }
  # - render ungapped sequence
  else {
    foreach $pAnnot (@$pTable) {
      substr ($$pSeq{sequence}, $$pAnnot{offset}-1, $$pAnnot{end}-$$pAnnot{offset}+1)
        = $SmbMask x ($$pAnnot{end}-$$pAnnot{offset}+1);
    }
  }

  # output sequence and return
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# select HAPPY marker sites
#
# INTERFACE
# - argument 1:   reference to sequence data structure
#                 sequence is purified via &SeqQueue
#
# - global options:
#   -debug        [STD]
#   -SlcEnds      select marker sites from both ends if sequence size > 2x this
#                 value, default: $ProgParam{default}{SiteHappyEnds}
#                 NOTE: Switch -SlcEnds in object SeqStreamIn must be turned off!
#   -WinSize      size of appropriate sequence window
#
sub ProgSiteHappy {
  my (@SiteParamSet);
  my ($pSeq) = @_;
  my ($debug, $dbg2, $SlcEnds, $SafetyEnds, $SafetyMkFail, $iSeqLen);
  my ($AnnotNum, $pAnnot);
  my (@SeqRange, @SeqRangePick, $SeqPos, $pSeqSearch,
      $ItSet, $pSeqMarker, $CtMarker);

  # function constants
  # try site parameter sets of descending order of stringency
  @SiteParamSet = ( {
    -idnum      => 1,
    -GcExtreme  => 0.30, # stringent
    -PrimerSize => 150,  # stringent
    -ProdMin    => 300,
    -ProdMax    => 1000,
    }, {
    -idnum      => 2,
    -GcExtreme  => 0.25, # loose
    -PrimerSize => 150,  # stringent
    -ProdMin    => 300,
    -ProdMax    => 1000,
    }, {
    -idnum      => 3,
    -GcExtreme  => 0.25, # loose
    -PrimerSize => 100,  # less stringent
    -ProdMin    => 300,
    -ProdMax    => 1000,
    });

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $SlcEnds = $ProgOpt{-SlcEnds} || $ProgParam{default}{SiteHappyEnds};

  $SafetyEnds = 70;
  $SafetyMkFail = 200;
  $$pSeq{id} =~ s/-[35]$//;
  $$pSeq{descr} =~ s/^scaffold/"from scaffold $$pSeq{id}"/e;
  $iSeqLen = length ($$pSeq{sequence});
  $debug and printf STDERR "%s. sequence %s, length %d\n", &MySub,
    $$pSeq{id}, $iSeqLen;

  ##############################################################################
  # look out for existing markers
  # - marker locations are assumed to be annotated using annotation label 'MARK'
  #   and text containing the marker ID starting with "DH"
  # - report existing marker locations
  # - turn site of failed marker into masked sequence range

  for ($AnnotNum=0; $AnnotNum<@{$$pSeq{annot}}; $AnnotNum++) {
    $pAnnot = $$pSeq{annot}[$AnnotNum];
    if ($$pAnnot{type} ne 'MARK') { next }
    if ($$pAnnot{text} !~ m/^DH\w+/) { next }

    # extract marker information from marker annotation
    $$pAnnot{text} =~ m/^DH\w+/ and $$pAnnot{marker} = $&;
    $$pAnnot{text} =~ m/description=.*(chr\d+)/i and $$pAnnot{MapChr} = $1;
    $$pAnnot{text} =~ m/description=.*?(-?\d+\.\d{3})\b/ and $$pAnnot{MapPos} = $1;
    $$pAnnot{text} =~ m/description=from(\S+)/ and $$pAnnot{MapFrom} = $1;
    printf STDERR "WARNING: sequence %s already contains HAPPY marker %s from %s (Chr %s, pos. %s) at %d..%d\n",
      $$pSeq{id}, $$pAnnot{marker}, $$pAnnot{MapFrom},
      $$pAnnot{MapChr} ? $$pAnnot{MapChr}:'?', $$pAnnot{MapPos} ? $$pAnnot{MapPos}:'?',
      $$pAnnot{offset}, $$pAnnot{end};

    # failed marker:
    # - turn marker annotation into SMALL masked sequence range
    if ($$pAnnot{MapPos} < 0 ) {
      @SeqRange = (&Max (1, $$pAnnot{offset}-$SafetyMkFail),
        &Min ($$pAnnot{end}+$SafetyMkFail, length $$pSeq{sequence}));
      printf STDERR "  masking marker range (failed): %d..%d\n", @SeqRange;
      substr ($$pSeq{sequence}, $SeqRange[0]-1, $SeqRange[1]-$SeqRange[0]+1) =~ s/\w/N/g;
    }

    # passed marker:
    # - turn marker annotation into LARGE masked sequence range
    else {
      @SeqRange = (&Max (1, $$pAnnot{offset} - 0.67*$SlcEnds),
        &Min ($$pAnnot{end} + 0.67*$SlcEnds, length $$pSeq{sequence}));
      printf STDERR "  masking marker range (passed): %d..%d\n", @SeqRange;
      substr ($$pSeq{sequence}, $SeqRange[0]-1, $SeqRange[1]-$SeqRange[0]+1) =~ s/\w/N/g;
    }
  }

  ##############################################################################
  # scan for markers

  # loop over marker
  $CtMarker = 0;
  for ($SeqPos=0; $SeqPos<length($$pSeq{sequence}); $SeqPos+=$SlcEnds) {

    # pick sequence ranges of $SlcEnds size
    @SeqRange = ($SeqPos ? $SeqPos:$SafetyEnds, &Min ($SeqPos + $SlcEnds, length ($$pSeq{sequence})));
    $debug and printf STDERR "%s. marker selection at range: %d..%d\n", &MySub, @SeqRange;
    $pSeqSearch = &SeqRange ($pSeq, @SeqRange, -debug=>$dbg2);
    $$pSeqSearch{id} .= $CtMarker ? "am$CtMarker" : 'al';

    # loop over search parameter sets - try to select marker
    foreach $ItSet (@SiteParamSet) {
      @SeqRangePick = &SitePcrmarkerRange ($pSeqSearch, %$ItSet);
      $SeqRangePick[0] and last;
    }

    # prepare marker site sequence, output sequence
    if ($SeqRangePick[0]) {
      $debug and printf STDERR "  found PCR marker site %s using set %d at range %d..%d\n",
        $$pSeqSearch{id}, $$ItSet{-idnum}, $SeqRange[0]+$SeqRangePick[0], $SeqRange[0]+$SeqRangePick[1];
      $pSeqMarker = &SeqClipUnk (&SeqRange (
        { sequence => $$pSeqSearch{sequence},
          id       => $$pSeqSearch{id},
          descr    => $$pSeqSearch{descr} },
        $SeqRangePick[0], $SeqRangePick[1],
        -debug => $dbg2));
      $ProgParam{store}{queue}{SeqOut}->Push($pSeqMarker);
      $CtMarker ++;
      $SeqPos += $SeqRangePick[1];
    }

    # no marker site
    else {
      $debug and printf STDERR "  no PCR marker site found, seq %s, length %d\n",
        $$pSeqSearch{id}, length ($$pSeqSearch{sequence});
    }

  } # end: sequence range loop
}


# select PCR marker range from sequence
#
# INTERFACE
# - argument 1:   reference to sequence data structure
#                 pure sequence string is expected
#
# - options:
#   -debug        [STD]
#   -GcExtreme    extreme mean G/C content for primer site - threshold criterion
#   -PrimerSize   size of sequence window fulfilling primer site criteria
#   -ProdMin      minimal size of PCR product (2 * primer site + primer distance)
#   -ProdMax      minimal size of PCR product (2 * primer site + primer distance)
#
# - return val:   - array of range offset, range end
#                 - undef if an error occurs
#
sub SitePcrmarkerRange {
  my ($pSeq, %opt) = @_;
  my ($debug, $iSeqLen);
  my (%win, $SeqSlice, $bSuccess);

  # function parameters
  $debug = $opt{-debug};
  $iSeqLen = length $$pSeq{sequence};

  $debug and printf STDERR "%s. scanning sequence %s for marker sites\n", &MySub, $$pSeq{id}||"''";
  $debug and printf STDERR "  sequence length: %d\n", $iSeqLen;
  %win = (
    PrimerSize => $opt{-PrimerSize} || $ProgParam{default}{SitePrimerSize},
    GCmin      => $opt{-GcExtreme} || $ProgParam{default}{SitePrimerGcExtreme},
    GCmax      => 1 - ($opt{-GcExtreme} || $ProgParam{default}{SitePrimerGcExtreme}),
    );
  $win{PrimerDistMin} = ($opt{-ProdMin} || $ProgParam{default}{SitePcrprodMin});
  $win{PrimerDistMax} = ($opt{-ProdMax} || $ProgParam{default}{SitePcrprodMax}) - 2 * $win{PrimerSize};
    # add later: pos1, stop1, GC1, GC1match, pos2, stop2, GC2, GC2match

  # 1st primer site
  $win{pos1} = 1;
  $win{stop1} = $iSeqLen - (2 * $win{PrimerSize} + $win{PrimerDistMin});
  $win{GC1} = 0;
  SitePcrmarker: while ($win{pos1} <= $win{stop1}) {
    substr ($$pSeq{sequence}, $win{pos1}-1, $win{PrimerSize}) =~ m/^[atAT]+/
      and $win{pos1} += length ($&);
    $SeqSlice = substr ($$pSeq{sequence}, $win{pos1}-1, $win{PrimerSize});
    $win{GC1} = &Sum (@{ &SeqCodeCount ($SeqSlice) }{'G','C'}) / $win{PrimerSize};
    $win{GC1match} = ($win{GC1}>=$win{GCmin} and $win{GC1}<=$win{GCmax}) ? 1 : 0;
    $debug and printf STDERR "  pos %d, G/C %.2f, G/C match %d%s\n",
      $win{pos1}, $win{GC1}, $win{GC1match}, $win{GC1match} ? ", seq $SeqSlice":'';

    # 2nd primer site
    if ($win{GC1match}) {
      $win{pos2} = $win{pos1} + $win{PrimerSize} + $win{PrimerDistMin};
      $win{stop2} = &Min ($win{pos1} + $win{PrimerSize} + $win{PrimerDistMax}, $iSeqLen - ($win{PrimerSize}));
      $win{GC2} = 0;
      while ($win{pos2} <= $win{stop2}) {
        substr ($$pSeq{sequence}, $win{pos2}-1, $win{PrimerSize}) =~ m/^[atAT]+/
          and $win{pos2} += length ($&);
        $SeqSlice = substr ($$pSeq{sequence}, $win{pos2}-1, $win{PrimerSize});
        $win{GC2} = &Sum (@{ &SeqCodeCount ($SeqSlice) }{'G','C'}) / $win{PrimerSize};
        $win{GC2match} = ($win{GC2} >= $win{GCmin} and $win{GC2} <= $win{GCmax}) ? 1 : 0;
        $debug and printf STDERR "    pos %d, G/C %.2f, G/C match %d\n",
          $win{pos2}, $win{GC2}, $win{GC2match}, $win{GC2match} ? ", seq $SeqSlice":'';

        # found 2nd site or move 2nd window
        if ($win{GC2match}) {
          last SitePcrmarker;
        } else {
          $win{pos2} += int &Max ((&Max ($win{GCmin} - $win{GC2}, $win{GC2} - $win{GCmax})) * 1.25 * $win{PrimerSize}, 2);
        }
      }
      $debug and print  STDERR "    reached end of window\n";
    }

    # move 1st window
    $win{pos1} += int &Max ((&Max ($win{GCmin} - $win{GC1}, $win{GC1} - $win{GCmax})) * 1.25 * $win{PrimerSize}, 2);
  }

  # return marker sequence range
  if ($win{GC2match}) {
    return ($win{pos1}, $win{pos2} + $win{PrimerSize}),
  } else {
    $debug and printf STDERR "%s. no marker site found, ID %s\n", &MySub, $$pSeq{id}||"''";
    return;
  }
}


# perform ORF analysis
#
# INTERFACE
# - argument 1:   reference to sequence data structure
#
# - global options:
#   -debug        [STD]
#   -OutImg       [STD]
#   -OutImgWidth  [STD]
#   -OutTab       [STD]
#   -OutTabFmt    [STD]
#   -SlcFrame     frames to be regarded in the analysis
#   -SlcLen       select for minimum protein length of ORF (tabular output)
#
sub ProgOrf {
  my ($pSeq) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $SlcLen = $ProgOpt{-SlcLen};
  my $iSeqLen = length $$pSeq{sequence};
  my @frame = defined(@{$ProgOpt{-SlcFrame}}) ?
    @{$ProgOpt{-SlcFrame}} : ('+1','+2','+3','-1','-2','-3');

  my %img;
  $img{flag} = defined ($ProgOpt{-OutImg}) || $ProgOpt{-OutImgWidth} || $ProgOpt{-OutImgType};
  $img{width} = $ProgOpt{-OutImgWidth} || $ProgParam{default}{OutImgWidth};
  $img{RelHigh} = $ProgOpt{-OutImgRelhigh} || $ProgParam{default}{OutImgRelhigh}{orf};
  $img{ColPen} = &ColorFormat('black');

  ########################################################################
  # analysis
  my (%result,@report,%graph);

  # loop over translation frames
  foreach my $CtFrame (@frame) {

    # analyse for all that stuff like start/stop positions etc.
    # derived positions refer to the plus-stranded sequence
    my ($pStart,$pCodon,$pProbab) = &TranslStartArray ($$pSeq{sequence}, -frame=>$CtFrame, -debug=>$dbg2);
    my ($pStop,$SeqTransl) = &TranslStopArray ($$pSeq{sequence}, -frame=>$CtFrame, -debug=>$dbg2);
    $result{$CtFrame} = {
      start   => $pStart,
      probab  => $pProbab,
      codon   => $pCodon,
      SeqProt => $SeqTransl,
      };

    # produce array of possible ORFs, calculate ORF properties
    # add 5' truncated ORF
    if ($pStart and int(@$pStart) and $$pStart[0]>3) {
      $SeqTransl =~ m/^([a-z]*)(\*)?.*$/i;
      $1 and push @report, {
        frame   => $CtFrame,
        start   => ($CtFrame<0)? $iSeqLen+$CtFrame+1 : int $CtFrame,
        probab  => 0,
        codon   => 'truncated',
        SeqProt => $1,
        LenProt => length $1,
        end     => $2 ? 'stop':'truncated',
        };
    }
    for (my $CtI=0; $CtI<int(@$pStart); $CtI++) {
      my $StrProtPos = ($CtFrame<0)? $iSeqLen-$$pStart[$CtI]+1 : $$pStart[$CtI];
      $StrProtPos -= abs ($CtFrame);
      $StrProtPos /= 3;
      substr($SeqTransl,$StrProtPos) =~ m/^([a-z]+)(\*)?.*$/i;
      push @report, {
        frame   => $CtFrame,
        start   => $$pStart[$CtI],
        probab  => $$pProbab[$CtI],
        codon   => $$pCodon[$CtI],
        SeqProt => $1,
        LenProt => length $1,
        end     => $2 ? 'stop':'truncated',
        };
    }

  ########################################################################
  # image representation

    # prepare plot subgraph
    if ($img{flag}) {
      %graph = (
        BgColor    => 'white',
        BgTranspar => $ProgOpt{-OutImgTransp},
        plot => [
          { DimPixel     => { x=>$img{width} },
            HeightRel    => $img{RelHigh},
            DataType     => 'HCA',
            data         => { x=>$pStart, y=>$pProbab },
            DataRange    => {
              x => [ 0, $iSeqLen ],
              y => [ 0, 1 ],
              },
            ReprType     => 'column',
            ReprColor    => 'green',
          },
          { DataType     => 'MapPos',
            data         => { x=>$pStop, },
            DataRange    => {
              x => [ 0, $iSeqLen ],
              y => [ 0, 1 ],
              },
            ReprType     => 'column',
            ReprColor    => 'red',
          },
          ],
        scale => [
          { PlotNum  => 0,
            location => 'x',
            color    => 'black',
          },
          ],
        );
      unless (&Graph (\%graph, -debug=>$dbg2)) {
        printf "%s. ERROR: failure in making plot graph, frame %s\n", &MySub, $CtFrame;
      }
      %{$img{$CtFrame}} = %graph;
      $img{height} += int ($graph{plot}[0]{DimPixel}{y}) + 1;
    }

  } # end foreach frame

  ########################################################################
  # output start/stop codon plot

  # finish plot graph
  if ($img{flag}) {

    # create final image: width x height, background, pen color
    # - the last %graph object is still used as parameter source
    $img{height}    += $graph{scale}[0]{DimPixel}{y};
    $img{HeightStep} = int $graph{plot}[0]{DimPixel}{y};
    $img{LLabelFont} = GD::Font->Giant;
    $img{WidthLLabel} = $img{LLabelFont}->width() * 2;
    $img{width} = $graph{width} + $img{WidthLLabel};
    $img{all} = new GD::Image ($img{width}, $img{height});
    $img{PalPen} = $img{all}->colorAllocate(@{&ColorFormat('transparent')});
    $img{all}->transparent ($img{PalPen});
    $img{all}->filledRectangle (0, 0, $img{width}, $img{height}, $img{PalPen});
    $img{PalPen} = $img{all}->colorAllocate (@{$img{ColPen}});

    # loop over frame sub-graphs
    my $CtHeight=0;
    my $CtFrame;
    foreach $CtFrame (@frame) {

      # delimiting line
      $img{all}->line (
        $img{WidthLLabel}+$img{$CtFrame}{PosPlot}{x}, $CtHeight,
        $img{WidthLLabel}+$img{$CtFrame}{PosPlot}{x}+$img{$CtFrame}{plot}[0]{DimPixel}{x}, $CtHeight,
        $img{PalPen});
      $CtHeight += 1;

      # copy plots
      $img{all}->copy ($img{$CtFrame}{img},
        $img{WidthLLabel}+$img{$CtFrame}{PosPlot}{x}, $CtHeight+$img{$CtFrame}{PosPlot}{y},
        $img{$CtFrame}{PosPlot}{x}, $img{$CtFrame}{PosPlot}{y},
        $img{$CtFrame}{plot}[0]{DimPixel}{x}, $img{$CtFrame}{plot}[0]{DimPixel}{y},
        );

      # frame label
      $img{all}->string ($img{LLabelFont},
        1, $CtHeight+$img{HeightStep}*0.5-$img{LLabelFont}->height()*0.5,
        $CtFrame, $img{PalPen});

      $CtHeight += $img{HeightStep};
    }

    # copy scale
    $CtFrame = $frame[$#frame];
    $debug and printf STDERR "%s. scale image generated: %s%s\n", &MySub,
      $img{$CtFrame}{scale}[0]{img} ? 'yes':'no',
      $img{$CtFrame}{scale}[0]{img} ? ", height: $graph{scale}[0]{DimPixel}{y}" : '';
    $img{all}->copy ($img{$CtFrame}{scale}[0]{img},
      $img{WidthLLabel}+$img{$CtFrame}{PosPlot}{x}+$img{$CtFrame}{scale}[0]{PosX}, $CtHeight,
      0, 0, $img{$CtFrame}{scale}[0]{img}->getBounds(),
      );

    # save image
    $img{path} = $ProgOpt{-OutImg} || &PathUnique (-name=>&PrepOstump($pSeq,-stamp=>'_orf').'#.png', -NoSize=>1, -touch=>1);
    &WriteFile ($img{path}, $img{all}->png());

  } # end plot graph

  ########################################################################
  # output sequences

  # output full protein translations as fastA
  if ($ProgOpt{-OutSeq}) {
    foreach my $CtFrame (@frame) {
      $ProgParam{store}{queue}{SeqOut}->Push( {
        id       => $$pSeq{id}."_prot${CtFrame}",
        sequence => $result{$CtFrame}{SeqProt},
        } );
      $debug and printf STDERR "%s. printing protein sequence %s\n", &MySub, $$pSeq{id}."_prot${CtFrame}"||"''";
    }
  }

  ########################################################################
  # output tabular report

  # table header
  &OutTabIntro ("$ProgFile -$ProgMode\n");
  &OutTabIntro (sprintf "time: %s\n", &TimeStr());
  &OutTabIntro (sprintf "sequence ID: %s\n", $$pSeq{id}||"''");
  &OutTabIntro (sprintf "sequence length: %d letters\n", $iSeqLen);
  $img{flag} and
  &OutTabIntro (sprintf "image: %s\n", $img{path});
  &OutTabIntro ("\n");

  # fully translated sequence
  &OutTabIntro ("$$pSeq{sequence}\n");
  foreach my $CtFrame (@frame) {
    my $SeqProt;
    if ($CtFrame > 0) {
      $SeqProt = (' ' x ($CtFrame-1)) . join ('  ', split(//,$result{$CtFrame}{SeqProt}));
    } else {
      $SeqProt = (' ' x (2 + (($iSeqLen+$CtFrame+1) % 3))) . join ('  ', split(//,reverse($result{$CtFrame}{SeqProt})));
    }
    &OutTabIntro ("$SeqProt\n");
  }

  # column labels
  &OutTabHeadln ([qw(frame ntpos_plus start_codon start_probab end_status protein_len protein_seq)]);

  # loop over entries in report data structure
  foreach (sort { $a->{frame} cmp $b->{frame} or
                  $a->{start} <=> $b->{start}; } @report) {

    # select for protein length (sequence string is pure)
    if ($SlcLen) {
      if ($_->{LenProt} < $$SlcLen[0]) { next }
      if ($$SlcLen[1] and $_->{LenProt} > $$SlcLen[1]) { next }
    }

    # output report entry
    &OutTabLine ([
      sprintf ("%s%d", ($_->{frame}>0)?'+':'', $_->{frame}),
      $_->{start},
      $_->{codon},
      $_->{probab},
      $_->{end},
      $_->{LenProt},
      $_->{SeqProt},
      ]);
  }

  # end table
  &OutTabTail();
}


# plot local protein properties
#
# INTERFACE
# - argument 1:   reference to sequence data structure
#
# - global options:
#   -debug        [STD]
#   -OutImg       [STD]
#   -OutImgWidth  [STD]
#
sub ProgProtPlot {
  my ($pSeq) = @_;
  my ($debug, $dbg2);
  my ($iSeqLen, $PObjAcid, $PObjHydrophob, $StepSize, $window);
  my (%img, %graph);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  $iSeqLen = length $$pSeq{sequence};
  $img{width} = $ProgOpt{-OutImgWidth} || $ProgParam{default}{OutImgWidth};
  $img{RelHigh} = $ProgOpt{-OutImgRelhigh} || $ProgParam{default}{OutImgRelhigh}{ProtPlot};

  # analysis
  $PObjAcid = Math::Plot2D->new(&ProtPlotAcid($$pSeq{sequence},-debug=>$dbg2),
    -TabType=>'A1y', -debug=>$dbg2);
  $PObjHydrophob = Math::Plot2D->new(&ProtPlotHydrophob($$pSeq{sequence},-debug=>$dbg2),
    -TabType=>'A1y', -debug=>$dbg2);

  # smoothening
  $StepSize = $iSeqLen / $img{width};
  $window = 4;

  # prepare plot graph
  %graph = (
    BgTranspar => $ProgOpt{-OutImgTransp},
    plot => [
      { DimPixel     => { x=>$img{width} },
        HeightRel    => $img{RelHigh},
        DataType     => 'AA',
        data         => $PObjAcid->SmoothPlot ($StepSize, -window=>$window),
        DataRange    => { x=>[0,$iSeqLen] },
        ReprType     => 'line',
        ReprColor    => 'red',
      },
      { DataType     => 'AA',
        data         => $PObjHydrophob->SmoothPlot ($StepSize, -window=>$window),
        DataRange    => { x=>[0,$iSeqLen] },
        ReprType     => 'line',
        ReprColor    => 'blue',
      },
      ],
    scale => [
      { PlotNum  => 0,
        location => 'x',
        color    => 'black',
      },
      { PlotNum  => 0,
        location => 'left',
        color    => 'red',
      },
      { PlotNum  => 1,
        location => 'right',
        color    => 'blue',
      },
      ],
    );

  # save image
  $img{path} = $ProgOpt{-OutImg} || &PathUnique (-name=>&PrepOstump($pSeq,-stamp=>'_ProtPlot').'#.png', -NoSize=>1, -touch=>1);
  unless (&Graph (\%graph, -save=>$img{path}, -debug=>$dbg2)) {
    printf STDERR "ERROR: unable to save image to %s\n", $img{path}|"''";
    undef $img{path};
  } else {
    print  "image saved to $img{path}\n";
  }
}


################################################################################
# sequence set curation and processing
################################################################################


# simple statistics for sequence files
#
# DESCRIPTION
# - this program mode reports statistics the complete sequence input:
#   - number of entries
#   - total sequence length
#   - mean sequence length
#   - sequence type (hidden in report syntax)
# - the sequence type is derived from the last sequence entry
#
sub ProgStatist {
  my ($pSeq, $pSeqLast, $SeqType, $CtSeq, $CtLetter);

  # evaluation loop over sequence entries
  while ($pSeq = &SeqQueue()) {
    $CtSeq ++;
    $pSeqLast = $pSeq;

    # count letters in pure sequences
    $CtLetter += length $$pSeq{sequence};
  }

  # sequence type
  $SeqType = &SeqType ($$pSeqLast{sequence}, -basic=>1);

  # print statistics
  printf "statistical report for sequence file%s %s\n",
    (@ProgArg==1)?'':'s', join(', ',&ListMaxfirst(\@ProgArg,5,-ElemExceed=>'...'));
  printf "sums: %d entries (%d %s)\n", $CtSeq, $CtLetter,
    ($SeqType eq 'protein') ? 'aa residues':'nucleotides';
  printf "average sequence length: %d\n", $CtSeq ? $CtLetter / $CtSeq : 0;
}


# sequence length statistics
#
# INTERFACE
# - global options:
#   -debug        [STD]
#   -OutImg       [STD]. An argument has the same effect as -OutStump=arg.
#                 We do multi-file (incl. multi-image) ouput here!
#   -OutImgWidth  [STD]
#   -OutStump     [STD]
#
# DESCRIPTION
# - the sequence type is derived from the last sequence entry
#
# DEBUG, CHANGES, ADDITIONS
# - nothing at the moment
#
sub ProgStatLen {
  my ($debug, $dbg2, $bTab, %img, %path);
  my ($pSeq, $pSeqLast, $SeqType, $CtSeq, $CtLetter);
  my ($pPlot, $pPlotDistrib, $pPlotCumulNum);
  my ($integral, $xLast, $xCurr, %graph, $hOut);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  $bTab = $ProgOpt{-OutTab};

  $ProgOpt{-OutStump} ||= $ProgOpt{-OutImg};
  $img{flag} = defined ($ProgOpt{-OutImg}) || $ProgOpt{-OutImgWidth} || $ProgOpt{-OutImgType};

  ##############################################################################
  # evaluate sequences

  # loop over sequence entries
  while ($pSeq = &SeqQueue()) {
    $CtSeq ++;
    $pSeqLast = $pSeq;

    # determine sequence length
    $$pSeq{length} = length $$pSeq{sequence};
    $CtLetter += $$pSeq{length};
    push @$pPlot, [ log($$pSeq{length})/log(10), $$pSeq{length} ];
  }

  # sequence type
  $SeqType = &SeqType ($$pSeqLast{sequence});

  # print statistics
  printf "\nStatistical report for sequence file%s %s\n",
    (@ProgArg==1)?'s:':'', join(' ',@ProgArg);
  printf "sums: %d entries (%d %s)\n", $CtSeq, $CtLetter,
    ($SeqType eq 'protein') ? 'aa residues':'nucleotides';
  printf "average sequence length: %d\n", $CtSeq ? $CtLetter / $CtSeq : 0;

  # for there's anything to report
  if (@$pPlot) {

    # save list of observed length values
    if ($path{table} = $bTab) {
      &WriteFile ($path{table}, join ('', map { "$_\n" }
        @{${ scalar &TableConvert('AA','AC',$pPlot) }[0]}));
    }

  ##############################################################################
  # prepare plot graphs

    if ($img{flag}) {

      # image parameters
      $img{width} = $ProgOpt{-OutImgWidth} || $ProgParam{default}{OutImgWidth};
      $img{RelHigh} = $ProgOpt{-OutImgRelhigh} || $ProgParam{default}{OutImgRelhigh}{StatLen};
      $path{base} = &PrepOstump ($pSeqLast, -stamp=>'_StatLen');

      # derive X axis dimensions from first plot object
      my $poPlot = Math::Plot2D->new(&DistribEmpir (
        ${ scalar &TableConvert ('AA','AC', $pPlot) }[0],
        -debug=>$dbg2), -debug=>$dbg2);
      unless ($poPlot) {
        print  STDERR "code ERROR: unable to create plot object from lengths sample\n";
        exit 1;
      }
      $img{OffX}  = ($ProgOpt{-SlcLen} and defined $ProgOpt{-SlcLen}[0]) ?
        log($ProgOpt{-SlcLen}[0])/log(10) : $poPlot->Xmin()||0;
      $img{EndX}  = ($ProgOpt{-SlcLen} and defined $ProgOpt{-SlcLen}[1]) ?
        log($ProgOpt{-SlcLen}[1])/log(10) : $poPlot->Xmax()||($img{OffX}+0.01);
      $img{StepX} = ($img{EndX} - $img{OffX}) / $img{width} / 2;
      $img{WinX}  = 20 * ($img{EndX}-$img{OffX}) / $poPlot->Size();

      # calculate entry distribution over length range
      $pPlotDistrib = $poPlot->SmoothPlot ($img{StepX},
        -IntervOff=>$img{OffX}, -IntervEnd=>$img{EndX},
        -window=>$img{WinX});
#      @$pPlotDistrib = map { $_->[1] *= $img{StepX}; $_; } @$pPlotDistrib;
#      $poPlot = Math::Plot2D->new(&DistribEmpir ($pPlotDistrib,
#        -debug=>$dbg2), -debug=>$dbg2);
#      exit 0;
      if ($debug) {
        printf STDERR "%s. distribution plot, size %d\n", &MySub, (@$pPlotDistrib - 2) / 2;
        foreach (@$pPlotDistrib) {
          printf STDERR "%s\t%s\n", $_->[0], $_->[1];
        }
      }

      # plot graph
      %graph = (
        BgTranspar => $ProgOpt{-OutImgTransp},
        plot => [
          { DimPixel     => { x=>$img{width} },
            HeightRel    => $img{RelHigh},
            DataType     => 'HCA',
            data         => { },
            DataRange    => { x=>[$img{OffX}], y=>[0] },
            ReprType     => 'line',
            ReprColor    => 'black',
          },
          ],
        scale => [
          { PlotNum  => 0,
            location => 'bottom',
          },
          { PlotNum  => 0,
            location => 'top',
          },
          { PlotNum  => 0,
            location => 'left',
          },
          { PlotNum  => 0,
            location => 'right',
          },
          ],
        );
      ($graph{plot}[0]{data}{x}, $graph{plot}[0]{data}{y}) =
        @{ scalar &TableConvert ('AA', 'AC', $pPlotDistrib, -debug=>$dbg2) };

      # save plot graph data
      $path{ImgDistribData} = sprintf ('%s_distrib_img.dat', $path{base});
      if ($hOut = FileHandle->new($path{ImgDistribData},'w')) {
        printf "writing file %s\n", $path{ImgDistribData};
      } else {
        printf STDERR "%s. ERROR: unable to write file %s\n", &MySub, $path{ImgDistribData};
      }
      &DataPrint (\%graph, -handle=>$hOut, -debug=>$dbg2);
      # create image
      $path{ImgDistrib} = sprintf ('%s_distrib.png', $path{base});
      if (&Graph (\%graph, -save=>$path{ImgDistrib}, -debug=>$dbg2)) {
        print  "sequence length, distribution plot: $path{ImgDistrib}\n";
      } else {
        printf STDERR "%s. ERROR: failure in making graph (distribution)\n", &MySub;
      }

      # calculate entry cumulation dependent on length
      $poPlot = Math::Plot2D->new(&DistribEmpir (
        scalar &TableConvert ('AC', 'AA', [
          ${ scalar &TableConvert ('AA','AC', $pPlot) }[0],
          [ (1) x $#$pPlot ] ],
        -debug=>$dbg2), -debug=>$dbg2), -debug=>$dbg2);
      $img{StepNum} = $img{width} * 2;
      $img{StepSize} = ($poPlot->Xmax() - $poPlot->Xmin()) / $img{StepNum};
      $xCurr = $xLast = $poPlot->Xmin();
      while ($xCurr <= $poPlot->Xmax()) {
        $integral += $poPlot->Integral ($xLast, $xCurr);
        push @$pPlotCumulNum, [ $xCurr, $integral ];
        $xLast = $xCurr;
        $xCurr += $img{StepSize};
      }
      foreach $_ (@$pPlotCumulNum) { $_->[1] = $integral - $_->[1]; }

      # plot graph - entry cumulation
      %graph = (
        BgTranspar => $ProgOpt{-OutImgTransp},
        plot => [
          { DimPixel     => { x=>$img{width} },
            HeightRel    => $img{RelHigh},
            DataType     => 'AA',
            data         => $pPlotCumulNum,
            DataRange    => { x=>[$img{OffX}], y=>[0] },
            ReprType     => 'line',
            ReprColor    => 'black',
          },
          ],
        scale => [
          { PlotNum  => 0,
            location => 'bottom',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'top',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'left',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'right',
            color    => 'black',
          },
          ],
        );

      # save plot graph data
      $path{ImgData} = sprintf ('%s_NumCumul_img.dat', $path{base});
      if ($hOut = &GetWriteHandle($path{ImgData},-filestm=>'plot image data to file')) {
        &DataPrint (\%graph, -handle=>$hOut, -debug=>$dbg2);
      }
      # create image
      $path{ImgNumCumul} = sprintf ('%s_NumCumul.png', $path{base});
      if (&Graph (\%graph, -save=>$path{ImgNumCumul}, -debug=>$dbg2)) {
        print  "sequence length, cumulative plot: $path{ImgNumCumul}\n";
      } else {
        printf STDERR "%s. ERROR: failure in making graph (cumulative distribution)\n", &MySub;
      }

      # calculate length cumulation dependent on length
      $poPlot = Math::Plot2D->new( &DistribEmpir($pPlot,-debug=>$dbg2), -debug=>$dbg2);
      $img{StepNum} = $img{width} * 2;
      $img{StepSize} = ($poPlot->Xmax() - $poPlot->Xmin()) / $img{StepNum};
      $xCurr = $xLast = $poPlot->Xmin();
      my @PlotCumulLen;
      while ($xCurr <= $poPlot->Xmax()) {
        $integral += $poPlot->Integral ($xLast, $xCurr);
        push @PlotCumulLen, [ $xCurr, $integral ];
        $xLast = $xCurr;
        $xCurr += $img{StepSize};
      }
      foreach (@PlotCumulLen) { $_->[1] = $integral - $_->[1]; }
      if ($debug) {
        printf STDERR "%s. cumulative length distribution, size %d\n", &MySub,
          (int(@PlotCumulLen)-2)/2;
        foreach (@PlotCumulLen) {
          printf STDERR "%s\t%s\taha\n", $_->[0], $_->[1];
        }
      }

      # plot graph - length cumulation
      %graph = (
        BgTranspar => $ProgOpt{-OutImgTransp},
        plot => [
          { DimPixel     => { x=>$img{width} },
            HeightRel    => $img{RelHigh},
            DataType     => 'AA',
            data         => \@PlotCumulLen,
            DataRange    => { x=>[$img{OffX}], y=>[0] },
            ReprType     => 'line',
            ReprColor    => 'black',
          },
          ],
        scale => [
          { PlotNum  => 0,
            location => 'bottom',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'top',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'left',
            color    => 'black',
          },
          { PlotNum  => 0,
            location => 'right',
            color    => 'black',
          },
          ],
        );

      # save plot graph data
      $path{ImgData} = sprintf ('%s_LenCumul_img.dat', $path{base});
      if ($hOut = FileHandle->new($path{ImgData},'w')) {
        printf "writing plot image data to file %s\n", $path{ImgData};
      } else {
        printf STDERR "%s. ERROR: unable to write file %s\n", &MySub, $path{ImgData};
      }
      &DataPrint (\%graph, -handle=>$hOut, -debug=>$dbg2);
      # create image
      $path{ImgLenCumul} = sprintf ('%s_LenCumul.png', $path{base});
      if (&Graph (\%graph, -save=>$path{ImgLenCumul}, -debug=>$dbg2)) {
        print  "sequence length, cumulative plot: $path{ImgLenCumul}\n";
      } else {
        printf STDERR "%s. ERROR: failure in making graph (cumulative distribution)\n", &MySub;
      }
    }

    # warning
    else {
      print  STDERR "NOTE: use program switch -OutImg to enforce graphical output\n";
    }
  }
}


# sequence letter statistics
#
# INTERFACE
# - options:
#   -individ    print statistics for each individual sequence entry
#
# - global options:
#   -debug      [STD]
#
# DESCRIPTION
# - the sequence type is derived from the last sequence entry
#
sub ProgStatSmb {
  my (%opt) = @_;
  my ($debug, $dbg2);
  my ($pSeq, $pSeqLast, $SeqType, $bNt);
  my ($CtSeq, $length, $CtNt);
  my ($pCtSymbSeq, %CtSymbSum, %symbol, @individ, @column, $pEntry);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # evaluation loop over sequence entries
  while ($pSeq = &SeqQueue()) {
    $CtSeq ++;
    $pSeqLast = $pSeq;

    # count letters in pure sequences
    $length = length $$pSeq{sequence};
    $CtNt += $length;

    # count letter frequencies
    $pCtSymbSeq = &SeqCodeCount ($$pSeq{sequence}, -debug=>$dbg2);
    foreach (keys %$pCtSymbSeq) {
      $CtSymbSum{$_} += $$pCtSymbSeq{$_};
    }
    if ($opt{-individ}) {
      @symbol{keys %$pCtSymbSeq} = (1) x int keys (%$pCtSymbSeq);
      $$pCtSymbSeq{id} = $$pSeq{id};
      $$pCtSymbSeq{length} = $length;
      push @individ, $pCtSymbSeq;
    }
  }
  unless ($CtSeq||0 and $CtNt||0) {
    print  STDERR "ERROR: empty input\n";
    return;
  }

  # sequence type
  $SeqType = &SeqType ($$pSeqLast{sequence}, -basic=>1);
  $bNt = int ($SeqType eq 'nucleotide');

  # print main statistics
  printf "\nstatistical report for sequence file%s: %s\n",
    (@ProgArg==1) ? '':'s', join(', ',@ProgArg);
  printf "sequence entries: %d\n", int $CtSeq;
  printf "total %s: %d\n",
    ($SeqType eq 'protein') ? 'aa residues' : 'nucleotides', int $CtNt;
  printf "average sequence length: %d\n", ($CtSeq) ? $CtNt / $CtSeq : 0;

  # calculate and print overall frequency statistics
  printf "\noverall %s counts:\n",
    ($SeqType eq 'protein') ? 'amino acid':'nucleotide';
  foreach (sort keys %CtSymbSum) {
  printf "%s\t%d\n", $_, $CtSymbSum{$_};
  }
  if ($bNt) {
    $CtSymbSum{'A/T'} = &Sum (@CtSymbSum{'a','A','t','T'}) * $CtNt
      / (&Sum (@CtSymbSum{'a','c','g','t','A','C','G','T'}) || 0.0001);
    $CtSymbSum{'C/G'} = &Sum (@CtSymbSum{'c','C','g','G'}) * $CtNt
      / (&Sum (@CtSymbSum{'a','c','g','t','A','C','G','T'}) || 0.0001);
  }
  printf "\noverall %s frequency:\n",
    ($SeqType eq 'protein') ? 'amino acid':'nucleotide';
  foreach (sort { length($a)<=>length($b) or $a cmp $b } grep { $CtSymbSum{$_} } keys %CtSymbSum) {
  printf "%s\t%.3f\n", $_, $CtNt ? $CtSymbSum{$_} / $CtNt : 0;
  }

  # individual frequency statistics
  if ($opt{-individ}) {
    @column = ('id', 'length', sort keys (%symbol));
    printf "\nindividual %s counts:\n",
      ($SeqType eq 'protein') ? 'amino acid':'nucleotide';
    printf "%s\n", join("\t",@column);
    foreach (@individ) {
    printf "%s\n", join ("\t", map { $_||0 } @{$_}{@column});
    }
    printf "\nindividual %s frequency:\n",
      ($SeqType eq 'protein') ? 'amino acid':'nucleotide';
    if ($bNt) { push @column, 'A/T', 'C/G'; }
    printf "%s\n", join("\t",@column);
    foreach $pEntry (@individ) {
      if ($bNt) {
        $$pEntry{'A/T'} = sprintf ("%.3f", &Sum (@$pEntry{'a','t','A','T'}) / (&Sum (@$pEntry{'a','c','g','t','A','C','G','T'}) || 0.0001));
        $$pEntry{'C/G'} = sprintf ("%.3f", &Sum (@$pEntry{'c','g','C','G'}) / (&Sum (@$pEntry{'a','c','g','t','A','C','G','T'}) || 0.0001));
      }
      foreach (keys %symbol) {
        $$pEntry{$_} = sprintf ("%.3f", $$pEntry{length} ? $$pEntry{$_} / $$pEntry{length} : 0);
      }
      printf "%s\n", join ("\t", @$pEntry{@column});
    }
  }
}


# randomise sequence entries
#
sub ProgRandOrder {

  # read complete input
  my (@seq);
  while (defined (my $pSeq=&SeqQueue())) { push (@seq,$pSeq) }

  # re-output sequence in randomised order
  $ProgParam{store}{queue}{SeqOut}->Push(map{@{$_||[]}} &RandArrayOrder(\@seq));
}


# randomise input and re-output sequence segments
#
# INTERFACE
# - global options:
#   -debug      [STD]
#   --size      fragment size (unit of randimization)
#   --sd        standard deviation for fragment size randomization
#   --spacesd   standard deviation for randomized fragment spacing
#
# DESCRIPTION
# - pick fragments from the (concatenated) input sequence randomly. Optionally,
#   intervening spacer fragments turn the randomization process nearly perfect
#   (optional argument --spacesd).
#
sub ProgRandFragment {
  my $debug = $ProgOpt{-debug};
  my $FragSz = $ProgOpt{-var}{size} || 1000;
  my $FragSD = $ProgOpt{-var}{sd};
  my $SpaceSz = $ProgOpt{-var}{spacesd};

  # read input
  my @seq;
  while (defined (my $pSeq=&SeqQueue())) {
    push @seq, $$pSeq{sequence};
  }
  my $sSeq = join ($ProgOpt{-var}{join}||'', @seq);
  my $iSeqLen = length($sSeq);
  $debug and printf STDERR "%s. read %d sequence%s, total length %d\n", &MySub,
    int(@seq), (@seq==1) ? '':'s', $iSeqLen;
  undef @seq;

  # random fragments
  my $CtSeg=0;
  my $FragSum=0;
  my $SpaceSwitch=0;
  while ($FragSum < $iSeqLen) {
    # generate sequence fragment
    if ($SpaceSwitch) {
      my $FragLen = &nearest (1, &ProgRandGauss($FragSz,$ProgOpt{-var}{sd}));
      my $sSeqSeg = substr ($sSeq, $FragSum, &Min($FragLen,$iSeqLen-$FragSum));
      $ProgParam{store}{queue}{SeqOut}->Push(
        { id=>'RandFragment'.(++$CtSeg), sequence=>$sSeqSeg }
        );
      $FragSum += $FragLen;
    }
    # apply fragment spacer
    elsif ($SpaceSz) {
      $FragSum += &nearest (1, &ProgRandGauss(0,$SpaceSz,-oneside=>1));
    }
    $SpaceSwitch ^= 1;
  }
}


# randomise input and re-output sequence segments
#
# INTERFACE
# - global options:
#   -debug      [STD]
#   --size      fragment size (unit of randimization)
#   --sd        standard deviation for randomization of fragment size
#   --seg       number and size of final segments
#
# DESCRIPTION
# - pick fragments from the (concatenated) input sequence randomly and rejoin
#   them to a (small) number of segments. 
# - An example
#
#   input   ---------------------------------
#   segA    -  - -- - -  - -- - -  -- - - - -
#   segB    -- -  - - - - - -  -- - - - -- - 
#
#   Note that the fragments are concatenated in random order, and they may have
#   variable size (randomized using optional parameter --sd).
#
sub ProgRandSegment {
  my $debug = $ProgOpt{-debug};
  my $FragSz = $ProgOpt{-var}{size} || 1000;
  my $FragSD = $ProgOpt{-var}{sd};
  my @SegSzrel = $ProgOpt{-var}{seg} ?
    split(',',$ProgOpt{-var}{seg}) : (0.5,0.5);

  # read input
  my @seq;
  while (defined (my $pSeq=&SeqQueue())) {
    push @seq, $$pSeq{sequence};
  }
  my $sSeq = join ($ProgOpt{-var}{join}||'', @seq);
  my $iSeqLen = length($sSeq);
  $debug and printf STDERR "%s. read %d sequence%s, total length %d\n", &MySub,
    int(@seq), (@seq==1) ? '':'s', $iSeqLen;
  undef @seq;

  # shift requested fragment borders such that resulting borders scatter around
  # the desired rel. size
  for (my $CtSeg=0; $CtSeg<int(@SegSzrel)-1; $CtSeg++) {
    $SegSzrel[$CtSeg] -= $FragSz / 2 / $iSeqLen;
  }

  # generate fragment coordinates
  my @frag;
  {
    my $FragSum = 0;
    while ($FragSum < $iSeqLen) {
      # random fragment length
      my $RandLen = $FragSD? &ProgRandGauss($FragSz,$ProgOpt{-var}{sd}) : $FragSz;
      my $FragLen = &nearest (1, $RandLen);
      # full fragment coordinate
      push @frag, { off=>$FragSum, len=>&Min($FragLen,$iSeqLen-$FragSum) };
      $FragSum += $FragLen;
    }
  }
  # randomise order of fragments
  @frag = map{@{$_||[]}} &RandArrayOrder(\@frag);

  # re-join sequence fragments to yield segments, output
  for (my $CtSeg=0; $CtSeg<@SegSzrel; $CtSeg++) {
    my $FragSum = 0;
    my $sSeqSeg = '';
    while ($FragSum<$SegSzrel[$CtSeg]*$iSeqLen and @frag) {
      my $pFrag = shift @frag;
      $sSeqSeg .= ($FragSum ? ($ProgOpt{-var}{join}||''):'')
        . substr ($sSeq, $$pFrag{off}, $$pFrag{len});
      $FragSum += $$pFrag{len};
    }

    # remind sequence ID in global storage, output sequence
    $ProgParam{store}{queue}{SeqOut}->Push({id=>'RandSegment'.($CtSeg+1),sequence=>$sSeqSeg});
  }
}


# random values following normal-distribution
#
# INTERFACE
# - argument 1:  my value
# - argument 2:  sigma value
#
# - options:
#   -oneside     one-sided scattering, default two-sided
#
sub ProgRandGauss {
  my ($ArgMy,$ArgSigma,%opt) = @_;
  $ArgSigma or return $ArgMy;

  # need two components:
  # - sign
  # - homogeneously random value in range [0,1[
  my $ValRand = (rand()-0.5) * 2;
  my $sign = $opt{-oneside} ? 1 : &Sign($ValRand);

  # evaluate to position in Gaussian distribution
  my $ValTimesS = &GaussLimit (abs($ValRand));
  return $ArgMy + $sign * $ValTimesS * $ArgSigma;
}


# add entry to output of non-redundant for sequence identifiers
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - global options:
#   -debug      [STD]
#
# DESCRIPTION
# - The first encountered sequence entry having a multiple-occurring ID will
#   be found in the output.
#
sub ProgUniqueID {
  my ($pSeq) = @_;
  my $debug = $ProgOpt{-debug};

  # ID already read?
  if ($ProgParam{store}{seq}{$$pSeq{id}}) {
    $debug and print  STDERR "double occurrence of ID $$pSeq{id}\n";
    return;
  }

  # remind sequence ID in global storage, output sequence
  $ProgParam{store}{seq}{$$pSeq{id}} = 1;
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
}


# create BLAST database
#
# INTERFACE
# - global options:
#   -db          database target
#   -DbTitle     database title
#   -DbType      database type
#   -debug       [STD]
#
# - global data:
#   {default}{BlastDbType}  database type
#
# DESCRIPTION
# - This function doesn't have any arguments. Sequence input is got via
#   &SeqQueue.
# - There exist different incompatible BLAST database formats,
#   (cf. http://blast.wustl.edu/blast/dbfmts.html):
#   NCBI 1.4 nt/aa  - compatible to BLAST-NCBI 1.4 and BLAST-WU 2.0
#                   - created by programs pressdb (nt) or setdb (aa)
#                   - diagnostic file suffices:
#                     - nt: csq nhd ntb
#                     - aa: bsq ahd atb
#   NCBI 2.0 nt/aa  - compatible to BLAST-NCBI 2.0
#                   - created by program formatdb
#                   - diagnostic file suffices:
#                     - nt: nsq nhr nin
#                     - aa: ?
#   XDF 1 nt/aa     - compatible to BLAST-WU 2.0 licensed
#                   - created by xdformat
#                   - suffices of database files:
#                     - nt: xnd xns xnt
#                     - aa: xnd xns xnt
#   This function is able to create NCBI 1.4 and NCBI 2.0 databases of
#   type nt/aa.
# - The database format programs are pipeable, and we use this feature:
#   pressdb|setdb ... -
#   formatdb -i stdin ...
# - The sequence type is ruled out from the last sequence entry of the
#   last input file. Knowledge of the type is needed for appropriate
#   choice of the DB v1.4 formatting program (pressdb or setdb) or
#   formatting program settings (formatdb).
# - concerning sequence & DB output target:
#   - pressdb provides option -o for specifying the output target. The target
#     argument without a directory specification will automatically be
#     expanded to $ENV{BLASTDB}/$target within this code.
#     There's another kPerl package-specific environment variable,
#     $ENV{BLASTDBW}, representing priority write directory. This enables
#     you to manage different BLAST DB directories for reading/writing (you
#     may have some system-/ location-specific extravagancies).
#   - The setdb program creates the database in the directory of the
#     input file. Since this behaviour cannot be influenced, the output has
#     to be moved to the target path.
#   - formatdb provides option -o for specifying the output target. The target
#
# DEBUG, CHANGES, ADDITIONS
# - pipe purified sequence directly into formatting program.
#   Don't spend time (and nerves) on large temporary files
#   Then we need to open a writable process pipe instead of building a
#   system call closure.
#
sub ProgBlastDb {
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $DbType = $ProgOpt{-DbType} || $ProgParam{default}{BlastDbType};
  my $DbSizeLimit = 2e+09;

  my ($PathSeqpure, $pSeq, $pSeqLast, $SeqType, $SeqCt);
  my ($PathDb, $PathDbTitle, $PathProg, %opt);
  my ($pCall, $PathLog, $PathErrTmp, $PathErr, $PathTmp, $bErr);

  # output database name, path, title
  unless ($PathDb = $ProgOpt{-db}) {
    die "ERROR: no target database specified\n";
  }
  if ($PathDb !~ m|/|) {
    $PathDb = ($ENV{BLASTDBW}||$ENV{BLASTDB}) .'/'. $PathDb;
  }
  $PathDbTitle = $ProgOpt{-DbTitle} ||= &PathSplit ($PathDb)->{name};
  $opt{title} = "-t '$PathDbTitle'";

  # produce single clean fastA file
  # - option -pure was already set for SeqStreamIn object.
  #   You should set -pure=DNA5 for nt input by your own
  $PathSeqpure = $ProgParam{TmpManag}->Create();
  open (OUTFASTA, ">$PathSeqpure");
  while ($pSeq = &SeqQueue()) {
    print  OUTFASTA &SeqentryToFasta ($pSeq, -debug=>$dbg2);
    $pSeqLast = $pSeq;
    $SeqCt ++;
    if (defined($DbSizeLimit) and -s($PathSeqpure)>$DbSizeLimit) {
      if ($DbType eq 'NCBI 2.0') {
        $opt{vol} = "-v 2000000000";
        undef $DbSizeLimit;
      } else {
        printf STDERR "WARNING: reached database size limit (%s) with %d seqs\n", $DbSizeLimit, $SeqCt;
        last;
      }
    }
  }
  close OUTFASTA;
  unless ($SeqCt) {
    print  STDERR "ERROR: no sequences found in input\n";
    exit 1;
  }
  $debug and printf STDERR "%s. %d sequences found in input, tmp output to %s\n", &MySub,
    $SeqCt, $PathSeqpure||"''";

  # determine sequence type from sequence string of last sequence entry
  # decide: program, input option, output option, seq type option
  unless ($SeqType = &SeqType($$pSeqLast{sequence},-basic=>1,-debug=>$dbg2)) {
    print  STDERR "ERROR: unable to determine sequence type\n";
    exit 1;
  }
  $debug and printf STDERR "%s. input sequence type: %s\n", &MySub, $SeqType||"''";
  if ($DbType eq 'NCBI 1.4') {
    if ($SeqType eq 'protein') {
      $opt{prog} = $PathProg = $CorePath{call}{blast}{setdb};
    } else {
      $opt{prog} = $PathProg = $CorePath{call}{blast}{pressdb};
      $opt{out} = "-o $PathDb";
    }
    $opt{in} = $PathSeqpure;
    $PathLog = "$PathDb.". &PathSplit($PathProg)->{name} .'.log';
    $PathErr = "$PathDb.". &PathSplit($PathProg)->{name} .'.err';
    $opt{log} = "> $PathLog";
  } elsif ($DbType eq 'NCBI 2.0') {
    $opt{prog} = $PathProg = $CorePath{call}{blast}{formatdb};
    if ($SeqType eq 'nucleotide') {
      $opt{prog} .= " -p F";
    }
    $opt{in} = "-i $PathSeqpure";
    $opt{out} = "-n $PathDb";
    $PathLog = "$PathDb.". &PathSplit($PathProg)->{name} .'.log';
    $PathErr = "$PathDb.". &PathSplit($PathProg)->{name} .'.err';
    $opt{log} = "-l $PathLog";
  } else {
    printf STDERR "ERROR: unknown database sequence type %s\n", $DbType||"''";
    exit 1;
  }
  unlink $PathLog;

  # start formatting process
  # - $opt{log} must be the last in the list of options (may contain pipe symbol)
  # - do system call
  $PathErrTmp = $ProgParam{TmpManag}->Create(-touch=>1);
  $pCall = &CallExtClosure ($opt{prog}, $opt{out}, $opt{title}, $opt{in}, $opt{vol}, $opt{log}, "2>$PathErrTmp");
  $debug and printf STDERR "%s. calling:\n  %s\n", &MySub, &$pCall ('WhatRUCalling');
  # - eventually rise error flag
  # - combine process protocols
  # - exit if error occurred
  $bErr = ( (! &$pCall()) or (! -s $PathLog)
    or ((-s $PathErrTmp) and &ReadFile($PathErrTmp)=~m/error/i) );
  unless ($bErr) { unlink $PathErrTmp; }
  print  &ReadFile ($PathLog);
  if ($bErr) {
    printf STDERR "ERROR in process, call: %s\n", &$pCall ('WhatRUCalling');
    print  &ReadFile ($PathErr);
    &mv ($PathErrTmp, $PathErr);
    exit 1;
  }

  # move database output in case of setdb call
  if ($DbType eq 'NCBI 1.4' and $SeqType eq 'protein') {
    foreach ('', '.ahd', '.atb', '.bsq') {
      &mv ($PathSeqpure.$_, $PathDb.$_);
    }
  }

  # move or delete fastA output
  if ($ProgOpt{-OutSeq} and ($PathTmp = (grep { -e $_ } $PathDb, $PathSeqpure)[0])) {
    # xdformat in pressdb/setdb mode does not create fastA output
    # Then, instead we use the file of purified sequence input
    &mv ($PathTmp, $ProgOpt{-OutSeq});
    unlink $PathSeqpure, $PathErrTmp;
  } else {
    unlink $PathDb;
    $debug and printf STDERR "%s. deleting fastA output: $PathDb\n", &MySub;
  }

  # tidy up
  $debug or unlink $PathSeqpure;
}
# $Id: SeqHandle.pl,v 1.54 2008/06/11 08:44:58 szafrans Exp $
