#! /usr/local/bin/perl
################################################################################
#
#  K's Conversion Tool
#
#  copyright (c)
#  Karol Szafranski at Inst. Physiol. Chem. Dsseldorf, 1998
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 1998-2004,
#    szafrans@imb-jena.de
#  Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2005,
#    karol@pcbi.upenn.edu
#  Karol Szafranski on behalf of FLI Jena, Genome Analysis Group, 2005,
#    szafrans@fli-leibniz.de
#
################################################################################
#
# DESCRIPTION
#
# - See function &usage for description of command line syntax
#
# - each function comes along with a description at the beginning of the code
#   block
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   %GlobStore
#   $ProgFile
#   %ProgParam
#   $ProgMode, %ProgOpt, @ProgArg
#
# - usage help, command line arguments, basic I/O
#   &usage
#   &AddSwitch
#   &PrepOstump
#
# - basic search/replacement & text formats
#   &ProgChg
#   &ProgLF
#   &ProgLineNum
#   &ProgHtmlIndex
#
# - list matching and filtering
#   &ProgFilterHead
#   &ProgFilterId
#   &ProgFilter
#   &ProgUnique
#   &ProgRandom
#
# - table structures & formats
#   &ProgTabColComb
#   &ProgTabColMerge
#   &ProgTabColOrder
#   &ProgTabHtml
#   &ProgTab2Html
#   &ProgTabBlastPair
#   ------------------------ miscellaneous in alphabetical order:
#   &ProgTabCentromere
#   &ProgTabCrosscalc
#   &ProgTabCumul
#   &ProgTabData*
#   &ProgTabDdtScreen
#   &ProgTabDistSetComb
#   &ProgTabHappyFrom
#   &ProgTabHappyTo
#   &ProgTabInvPcr
#
# - special format conversions in alphabetical order
#   &ProgAlignErr
#   &ProgBlzlav2tab
#   &ProgBlzpippsFilter
#   &ProgCogLine
#   &ProgGbRel2div
#   &ProgGtKtalog
#   &ProgMan
#   &ProgPfam2go
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################


# global constants and variables
# also used by modules as a unique global data anchor
our %GlobStore;

# include path(s), includes
BEGIN {
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
}
#use strict; use warnings;  # OK 20050219
use FileHandle;
use MainLib::StrRegexp;
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs &InargFileExists);
use MainLib::File;
use MainLib::Misc;
use MainLib::Internet qw(&HtmlCharEncode);
use Math::Calc;
use Math::Random qw(&RandArrayOrder);
use Math::Plot2D;
use database::DbPlain;
use database::Table qw(&TableConvert);


# script ID
our $ProgFile = ( split('/',__FILE__) )[-1];

# global constants (esp. default values)
our %ProgParam;
$ProgParam{default}{ProgMode} = 'LFn';

# working desk
$ProgParam{store} = undef;


# manage I/O #####################################

# organize I/O handles
&Unbuffer();


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
our $ProgMode = undef;
our @ProgArg = ();
our %ProgOpt = ();
@ProgArg = &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};


# work flow manifold #############################

# chain to program mode without input argument(s)
if (0) { }
elsif (!@ARGV or $ProgMode=~m/^h(elp)?$/i) { &usage() }

# ensure input argument(s)
unless (@ProgArg) {
  print  STDERR "ERROR: input arguments missing\n";
  exit 1;
}
# validate input argument(s)
my $arg;
foreach $arg (@ProgArg) {
  unless ( $arg eq '-' or -s &PathExpand($arg) ) {
    printf STDERR "WARNING: input file %s does not exist or has zero length\n", $arg||"''";
  }
}

# chain to program mode (with input argument(s))
if (0) { }
elsif ($ProgMode =~ m/^AlignError$/i) {
  foreach $arg (@ProgArg) {
    &InargFileExists($arg,-stdin=>1) or next;
    &ProgAlignErr2 ($arg);
  }
}
elsif ($ProgMode =~ m/^blzlav2tab$/i) {
  &ProgBlzlav2tab (@ProgArg);
}
elsif ($ProgMode =~ m/^BlzpippsFilter$/i) {
  &ProgBlzpippsFilter (@ProgArg);
}
elsif ($ProgMode =~ m/^chg=(.*)$/i) {
  $ProgOpt{-ChgTo} = $1;
  unless ($ProgOpt{-search}) {
    print  STDERR "ERROR: no search string specified\n";
    exit 1;
  }
  foreach $arg (@ProgArg) {
    &ProgChg ($arg);
  }
}
elsif ($ProgMode =~ m/^COGlines?$/i) {
  &ProgCogLine ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^ColComb$/i) {
  &ProgTabColComb (@ProgArg);
}
elsif ($ProgMode =~ m/^ColMerge(=(\d+))?$/i) {
  $ProgOpt{-var}{ColTgt} = $2;
  &ProgTabColMerge (@ProgArg);
}
elsif ($ProgMode =~ m/^ColOrder=(.+)$/i) {
  $ProgOpt{-ColOrder} = [ split(/,/,$1) ];
  &ProgTabColOrder (@ProgArg);
}
elsif ($ProgMode =~ m/^filter=(.+)$/i) {
  $ProgOpt{-filter} = $1;
  &ProgFilter (@ProgArg);
}
elsif ($ProgMode =~ m/^FilterHead=(.+)$/i) {
  $ProgOpt{-FilterHead} = $1;
  &ProgFilterHead (@ProgArg);
}
elsif ($ProgMode =~ m/^FilterID=(.+)$/i) {
  $ProgOpt{-filter} = $1;
  &ProgFilterId (@ProgArg);
}
elsif ($ProgMode =~ m/^GbRel2div$/i) {
  &ProgGbRel2div (@ProgArg);
}
elsif ($ProgMode =~ m/^GbRel2vers$/i) {
  &ProgGbRel2vers (@ProgArg);
}
elsif ($ProgMode =~ m/^GtKtalog$/i) {
  &ProgGtKtalog (@ProgArg);
}
elsif ($ProgMode =~ m/^html$/i) {
  foreach $arg (@ProgArg) {
    print &HtmlCharEncode (scalar &ReadFile($arg));
  }
}
elsif ($ProgMode =~ m/^HtmlIndex$/i) {
  &ProgHtmlIndex (@ProgArg);
}
elsif ($ProgMode =~ m/^LFn$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgLF (shift @ProgArg);
}
elsif ($ProgMode =~ m/^LFr$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgLF (shift @ProgArg, -LF=>"\r");
}
elsif ($ProgMode =~ m/^LFrn$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgLF (shift @ProgArg, -LF=>"\r\n");
}
elsif ($ProgMode =~ m/^LineBreak$/i) {
  foreach $arg (@ProgArg) {
    &InargFileExists($arg,-stdin=>1) or next;
    print  &LineBreak (scalar &ReadFile($arg),
      -length=>$ProgOpt{-var}{len}, -rm=>1);
  }
}
elsif ($ProgMode =~ m/^(Line)?Num$/i) {
  for (my $CtArg=0; $CtArg<int(@ProgArg); ++$CtArg) {
    unless (&InargFileExists($ProgArg[$CtArg],-stdin=>1)) {
      splice @ProgArg, $CtArg, 1; $CtArg --;
    }
  }
  &ProgLineNum (@ProgArg);
}
elsif ($ProgMode =~ m/^man$/i) {
  foreach $arg (@ProgArg) {
    &ProgMan ($arg);
  }
}
elsif ($ProgMode =~ m/^match=(.+)$/i) {
  $ProgOpt{-filter} = $1;
  $ProgOpt{-ActNegative}  = 1;
  &ProgFilter (@ProgArg);
}
elsif ($ProgMode =~ m/^matchid=(.+)$/i) {
  $ProgOpt{-filter} = $1;
  $ProgOpt{-ActNegative}  = 1;
  &ProgFilterId (@ProgArg);
}
elsif ($ProgMode =~ m/^NonUnique$/i) {
  $ProgOpt{-ActNegative} = 1;
  &ProgUnique (@ProgArg);
}
elsif ($ProgMode =~ m/^order$/i) {
  &ProgOrder (@ProgArg);
}
elsif ($ProgMode =~ m/^pfam2go$/i) {
  &ProgPfam2go (@ProgArg);
}
elsif ($ProgMode =~ m/^random$/i) {
  &ProgRandom (@ProgArg);
}
elsif ($ProgMode =~ m/^TabBlastPair$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgTabBlastPair (@ProgArg);
}
elsif ($ProgMode =~ m/^TabCentromere$/i) {
  &ProgTabCentromere (@ProgArg);
}
elsif ($ProgMode =~ m/^TabCrosscalc=(.+)$/i) {
  &ProgTabCrosscalc (@ProgArg[0,1], $1);
}
elsif ($ProgMode =~ m/^TabCumul$/i) {
  &ProgTabCumul (@ProgArg);
}
elsif ($ProgMode =~ m/^TabData$/i) {
  &ProgTabDataPlot2D (@ProgArg);
}
elsif ($ProgMode =~ m/^TabDistSetComb$/i) {
  &ProgTabDistSetComb (@ProgArg);
}
elsif ($ProgMode =~ m/^TabHappyFrom$/i) {
  &ProgTabHappyFrom (@ProgArg);
}
elsif ($ProgMode =~ m/^TabHappyTo$/i) {
  &ProgTabHappyTo (@ProgArg);
}
elsif ($ProgMode =~ m/^TabHtml$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgTabHtml (@ProgArg);
}
elsif ($ProgMode =~ m/^Tab2Html$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgTab2Html (@ProgArg);
}
elsif ($ProgMode =~ m/^TabInvPcr$/i) {
  &ProgTabInvPcr (@ProgArg);
}
elsif ($ProgMode =~ m/^TabTmp$/i) {
  &ProgTabHappyTo (@ProgArg);
}
elsif ($ProgMode =~ m/^temp$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgTemp (@ProgArg);
}
elsif ($ProgMode =~ m/^unique$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgUnique (@ProgArg);
}
else {
  print  STDERR "ERROR: unknown program mode or switch '$ProgMode'\n";
  exit 1;
}

# exit script successfully
# cf. END block!
exit 0;


################################################################################
# usage help, command line arguments
################################################################################


sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile is used for conversion and manipulation of text-based file
 formats. Output is written to STDOUT.

COMMAND LINE SYNTAX
 $ProgFile  -<ModeSwitch> [-<OptionalSwitch> ...] <Arg1> [<Arg2> ...]

Arguments
---------
 Typically, args specify input files. Deviations are stated in the
 listing of ModeSwitches below.

path arguments:
 Relative paths will be resolved according to the pwd. Prefixes "~" and "~uid"
 are resolved to the home directories. Path "-" resolves to STDIN or STDOUT,
 depending on the context.

ModeSwitch (case-insensitive)
-----------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-AlignError       filter problem entries from AlnK project file
-blzlav2tab       convert *blz2c.lav file to extended tabular format which
                  is output to STDOUT
                  Arg1        *blz2c.lav file
-BlzpippsFilter   filter *blz2c.pip*.ps file and re-output to STDOUT
                  Arg1        *blz2c.pip*.ps file
                  Arg2        cut-off value for local alignment length
-chg=S            regular expressing changing. S specifies replace string,
                  search pattern is passed via switch -search.
                  --chgexpr=1 replace pattern is an expression, default: quoted
                              string
-COGline(s)       extract informative data line from a HTML report received
                  from an NCBI COG server query
-ColComb          combine tables column-wise, assuming lines are in
                  corresponding order. Multiple occurrence of the same
                  column label does not matter.
-ColMerge         combine tables column-wise, assuming lines are NOT in
                  corresponding order. Instead, the corresponding line in the
                  2nd table is found by indexation (default: 1st field).
                  Multiple occurrence of the same column label does not matter.
                  Arg1        master table
                  Arg2+       additional table(s)
                  --ColIdx=N  *** to be implemented ***
                  --ColTgt=N  column number at which to insert, numbering
                              offset at 0
-ColOrder=S       change column order in table
                  S           comma-separated list of column labels. Default
                              labelling if there're no label definitions in
                              the table file: A  B  ...
                  --splice=1  remove columns not specified in the list
-filter=S         filter off list entries matching to any entry in a list
                  (file or plain argument) of regular expressions.
                  -OutTabType=AA
                              matching is checked against full input line.
                              Output misses column labels hidden in comments.
-FilterHead=N     filter N heading lines
-FilterID=S       filter off list entries fully matching to any entry in a
                  list (file or plain argument)
                  -OutTabType=AA
                              Output misses column labels hidden in comments.
-GbRel2div        extract division labels from GenBank release file
                  "gbrel.txt". Output in lower-case spelling.
-GbRel2vers       extract release number from GenBank release file "gbrel.txt"
-GtKatalog        convert GtKtalog report to list of filenames. Bernd may tell
                  you exactly what this is good for.
-h(elp)           output command line syntax description and exit
-html             encode text for HTML document
-HtmlIndex        create index of headlines for HTML document
-LFn              change line feed syntax to \\n (UNIX)
-LFr              change line feed syntax to \\r (Apple MacIntosh)
-LFrn             change line feed syntax to \\r\\n (PC)
-LineBreak        break lines to maximum number (std. 80) of characters
                  --len=N     maximum number of line characters, default: 80
-LineNum          insert line number at the beginning of each line
-man              convert man page to plain text
-match=S          filter for list entries matching to any entry in a list
                  (file or plain argument) of regular expressions.
                  -OutTabType=AA
                              matching is checked against full input line
                              Output misses column labels hidden in comments.
-MatchID=S        filter for list entries fully matching to any entry in a
                  list (file or plain argument)
                  -OutTabType=AA
                              Output misses column labels hidden in comments.
-NonUnique        prepare list containing only non-unique entries of source
                  list
-order            reorder table to a desired order of entries
                  Arg1        list of entries in desired order
                  Arg2+       tabular input file(s)
-pfam2go          convert pfam2go translation table to data structure format
-random           take list and change lines to random order
                  --reduce=F  reduce input to fraction, reducing memory re-
                              quirement
-TabBlastPair     list of hit pairs from Blast.pl -ListQuery table
-TabCentromere    filter hit pairs for centromer seq motifs according to
                  appropriate topology
-TabCrosscalc=S   cross-calculate y values of two plot data tables. Automatic
                  iteration is done over all x values (from both source tables),
                  and a user-defined function (stated by S) is performed on
                  variables \$y1 and \$y2.
                  --reverse=1 do reverse integration (starting with x_max)
-TabCumul         recalculate tabular plot data to integral plot
                  --resol=F   rel. resolution of value range, default 500
                  --reverse=1 do reverse integration (starting with x_max)
-TabData          convert table to data structures format
-TabDistSetComb   combine a set of pairwise distance tables
-TabHappyFrom     convert HAPPY map data table to rich fastA
-TabHappyTo       convert rich fastA to HAPPY mapping submission table
-Tab2Html         format table to HTML
-TabHtml          parse table(s) from HTML document
-TabInvPcr        convert oligo report to invPCR table entry
                  Arg1+       report(s)
-TabTmp           some temporary table calculation
-temp             temporary code
-unique           prepare list containing only unique entries of source list

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-debug(=N)        print debug protocol to STDERR (sometimes STDOUT).
                  N           debug depth value
-DelimCol=S       regexp for data field splitting, default: '\\t'. Note that
                  the regexp shall not contain bracketed expressions.
-OutTabType=S     tabular dimensions to output:
                  A1          output list contains first fields of table input
                              only (default)
                  AA          output full table
-search=S         what to search for (RegExp), cmp. ModeSwitch -chg.
--*               function-specific switches. See the descriptions there.

Environment Variables
---------------------
 \$PERLPATH        primary search path for Perl package look-up
END_USAGE
  print "\n";
  exit 0;
}


# add program switches to global table (hash)
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global data:
#   $ProgMode
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &MainLib::Misc::GetoptsNArgs
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can reported in accordance to
#   the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my $switch = shift;

  # optional switches
  if ($switch =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($switch =~ m/^delimCol=(.+)$/i) {
    $ProgOpt{-delimCol} = $1;
    return;
  }
  if ($switch =~ m/^OutTabFmt\b/i) {
    die sprintf "ERROR: unknown switch %s. Typo of -OutTabType?\n", $&;
  }
  if ($switch =~ m/^OutTabType=(.+)$/i) {
    $ProgOpt{-OutTabType} = $1;
    unless ($ProgOpt{-OutTabType} =~ m/^A[1A]$/) {
      die sprintf "ERROR: unknown table type specifier '%s'\n", $ProgOpt{-OutTabType};
    }
    return;
  }
  if ($switch =~ m/^search=(.+)$/i) {
    $ProgOpt{-search} = $1;
    return;
  }
  if ($switch =~ m/^(?:var=|-)(\w+)[,=](.+)$/i) {
    $ProgOpt{-var}{$1} = $2;
    return;
  }

  # program mode switches
  if (defined($ProgMode)) {
    die sprintf "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($switch||"''");
  }
  else {
    $ProgMode = $switch;
    if ($ProgMode eq 'default') {
      $ProgMode = $ProgParam{default}{ProgMode};
    }
    return;
  }
}


# work out output path base
#
# INTERFACE
#
# - global options:
#   -debug       [STD]
#   -OutDir      [STD]
#   -OutStump    [STD]
#
# - return val:  output path base
#
sub PrepOstump {
  my ($debug);
  my ($PathRef, $PathStamp);

  # function parameters
  $debug = $ProgOpt{-debug};

  # prepare output path base
  # 1st: $ProgOpt{-OutStump}
  # 2nd: $ProgArg[0]
  unless ($PathStamp = $ProgOpt{-OutStump}) {
    $PathRef = $ProgArg[0];
    if ($PathRef eq '-') { $PathRef = 'stdin'; }
    $PathStamp = &PathChgSuffix (&PathExpand($PathRef), '', -last=>1);
  }

  # determine effective directory
  $ProgOpt{-OutDir} and $PathStamp =
    $ProgOpt{-OutDir} .'/'. &PathSplit($PathStamp)->{name};

  # final refinement: expand to nice, rooted path
  $PathStamp = &PathExpand ($PathStamp);

  # return path
  return $PathStamp;
}


################################################################################
# basic search/replacement & text formats
################################################################################


# replace using RegExps
#
# INTERFACE
# - argument 1: input file
#
# - global options:
#   -ChgTo      from mode switch argument
#   -debug      [STD]
#   -search     [STD]
#
# DESCRIPTION
# - features of the implementation:
#   - magic variables $1,$2,...,$`,$&$' work
#   - slashes in the search or replacement patterns do not cause problems
#
sub ProgChg {
  my ($PathIn) = @_;
  my ($debug);
  my ($doc, $search, $ChgTo, $ret);

  # function parameters
  $debug = $ProgOpt{-debug};
  unless ($search = $ProgOpt{-search}) {
    print  STDERR "ERROR: no search pattern specified\n";
    exit 1;
  }
  $ChgTo = $ProgOpt{-ChgTo};

  # process replacement parameters
  # - add backslash to sole slashes in the replace pattern
  $ChgTo =~ s{(?=^|[^\\])/}{\\/}g;
  if ($debug) {
    printf STDERR "%s. processed arguments\n", &MySub;
    printf STDERR "  search pattern: `%s'\n", $search;
    printf STDERR "  replace string: `%s'\n", $ChgTo;
  }

  # load document
  unless (defined ($doc=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # perform replacement
  # - we need the eval construction in order to let magic variable statements
  #   work as such
  $debug and printf STDERR "%s. evaluating '\$doc =~ s/\$search/$ChgTo/gm'\n", &MySub;
  if ($ProgOpt{-var}{chgexpr}) {
    eval "\$doc =~ s/\$search/$ChgTo/egm"; $ret = $@;
      # note: regexp/subst modifiers may be stated inside the search patterns,
      # e.g.: m/(?i)/
  } else {
    eval "\$doc =~ s/\$search/$ChgTo/gm"; $ret = $@;
  }
  if (length ($ret)) {
    print  STDERR "ERROR in regexp substitution\n";
    exit 1;
  }

  # output
  print  $doc;
}


# change line feed syntax
#
# INTERFACE
# - argument 1: input file
#
# - options:
#   -LF         new line feed string, default: \n
#
sub ProgLF {
  my ($PathIn, %opt) = @_;
  my ($doc, $ChgTo);

  # parameters
  $ChgTo = $opt{-LF} || "\n";

  # read input file
  unless (defined ($doc=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # change file, output
  $doc =~ s/(\r\n|\n\r|\r|\n)/$ChgTo/g;
  print  $doc;
}


# insert line numbers
#
# INTERFACE
# - argument 1: input file
#
sub ProgLineNum {
  my ($PathIn) = @_;
  my ($hIn, $CtI);

  # open input file
  unless ($hIn=FileHandle->new($PathIn)) {
    printf "%s. ERROR: unable to read input file %s\n", &MySub, $PathIn||"''";
    exit 1;
  }

  # read file and output with line numbers
  while (<$hIn>) {
    print  ++$CtI, ' ', $_;
  }
}


# add index of headlines to HTML document
#
# INTERFACE
# - argument 1: input file
#
sub ProgHtmlIndex {
  my ($PathIn) = @_;
  my ($debug);
  my ($doc, $PosOff);
  my (@item, $pItem, $ItemTitle, $LevelMax);

  # function parameters
  $debug = $ProgOpt{-debug};

  # read input file
  unless (defined ($doc=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # grab title
  if ($doc =~ m/<TITLE>(.+?)<\/TITLE>/is) {
    $ItemTitle = $1;
    $ItemTitle =~ s/\s+([\n\t\r]+\s*)?/ /gs;
  }

  # loop over headlines
  while ($doc =~ m/<H(\d)>(.+?)<\/H\1>/isg) {

    # remind first headlin position
    unless (defined $PosOff) {
      $PosOff = pos($doc) - length($&);
    }

    # enter headline item to list
    push @item, {
      level => int($1),
      text  => $2,
      pos   => pos($doc) - length($&),
      };
    $item[-1]{text} =~ s/\s+([\n\t\r]+\s*)?/ /gs;
    $debug and printf STDERR "%s. headline at pos. %d, level %d\n", &MySub,
      $item[-1]{pos}, $item[-1]{level};

    # old index - remove it
    if ($item[-1]{text} =~ m/^index/i and
        substr ($doc, $item[-1]{pos}-30, 30) =~ m/-HtmlIndex -->/
    ) {
      $debug and printf STDERR "%s. headline at pos. %d is previous index"
        . " - removing it\n", &MySub, $PosOff;
      if ($doc =~ m/\n<BR><BR>\n/sg) {
        $doc = substr ($doc, 0, $PosOff) . substr ($doc, pos($doc));
        pos($doc) = $PosOff - 1;
      }
      undef $PosOff;
      pop @item;
    }
  }
  $debug and printf STDERR "%s. found %d headlines\n", &MySub, int@item;

  # which is the highest level?
  $LevelMax = &Min (map { $_->{level} } @item);
  $debug and printf STDERR "%s. uppermost headline level: %d\n", &MySub, $LevelMax;

  # enter missing jump labels for chapter items - change document
  # - NOTE: we have to render the document from bottom to top in order to
  #   keep unprocessed positional information intact.
  # ...

  # output index
  print  "file: $PathIn\n";
  print  "title: $ItemTitle\n";
  print  "\n";
  printf "<!-- Convert.pl -HtmlIndex --><H%d>Index</H%d>\n",
    $LevelMax, $LevelMax;
  foreach $pItem (@item) {
    printf "%s%s<BR>\n", ' ' x ($pItem->{level}-$LevelMax), $pItem->{text};
  }
  print  "<BR><BR>\n\n\n";
}


################################################################################
# list matching and filtering
################################################################################


# filter number of heading lines
#
# INTERFACE
# - argument 1+: input file(s)
#
# - global options:
#   -debug       [STD]
#
sub ProgFilterHead {
  my (@ArgIn) = @_;
  my $debug = $ProgOpt{-debug};
  my $NumFilter = $ProgOpt{-FilterHead};

  # prepare filter
  my $CtFilter=0;
  foreach my $ItFile (@ArgIn) {
    my $pHdl = FileHandle->new($ItFile);
    while (<$pHdl>) {
      ++ $CtFilter;
      if ($CtFilter <= $NumFilter) { next }
      print;
    }
  }
}


# filter by list of identifiers
#
# INTERFACE
# - argument 1+: input file(s)
#
# - global options:
#   -ActNegative switches the action from filtering matches to the behavior of
#                reporting the matches
#   -debug       [STD]
#   -filter      arg for filtering (string or name of file containing list of strings)
#   -OutTabType  [STD]
#
sub ProgFilterId {
  my (@ArgIn) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $bActNeg = $ProgOpt{-ActNegative};

  # prepare filter
  my $ArgFlt = $ProgOpt{-filter};
  my %flt;
  if ((-r &PathExpand($ArgFlt) and -s &PathExpand($ArgFlt))
     or $ArgFlt eq '-') {
    $ArgFlt = ($ArgFlt eq '-') ? '-' : &PathExpand($ArgFlt);
    my $hFlt = FileHandle->new($ArgFlt);
    while (<$hFlt>) { m/^\S+/ and $flt{$&}=1 }
    if (%flt) {
      $debug and printf STDERR "%s. %d entries in filter file %s\n", &MySub, int(keys %flt), $ArgFlt||"''";
    } else {
      die sprintf "ERROR: unable to read filter file %s\n", $ArgFlt||"''";
    }
  } else {
    $debug and printf STDERR "%s. -filter switch argument interpreted as plain identifier: %s\n", &MySub, $ProgOpt{-filter}||"''";
    %flt = ( $ProgOpt{-filter}=>1 );
  }

  # get primary list from input file(s)
  my ($pCol,@table);
  foreach my $PathIn (@ArgIn) {
    my $pTabCurr;
    ($pTabCurr,$pCol) = &PlainToTable ($PathIn, -TabType=>'AA',
      -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
    if ($pTabCurr) {
      $debug and printf STDERR "%s. %d data line%s in input file %s\n", &MySub,
        int(@$pTabCurr), (@$pTabCurr==1)?'':'s', $PathIn||"''";
      push @table, @$pTabCurr;
    } else {
      printf STDERR "WARNING: no list entries in input file %s\n", $PathIn||"''";
      next;
    }
  }

  # do filtering, output
  if (int (@$pCol)) {
    printf "# column labels:\n# %s\n", join("\t",@$pCol);
  }
  foreach my $pLine (@table) {
    my $bMatch = exists $flt{$$pLine[0]};  # try full match onto first value of line
    if (($bActNeg and  $bMatch) or (!$bActNeg and !$bMatch)) {
      my $line = ($ProgOpt{-OutTabType} eq 'AA') ? join("\t",@$pLine) : $$pLine[0];
      print  "$line\n";
    } elsif ($debug) {
      printf STDERR "  entry %s %s\n",
        $$pLine[0]||"''", $bActNeg?'not matched':'filtered';
    }
  }
}


# filter by list of RegExps
#
# INTERFACE
# - argument 1+: input file(s)
#
# - global options:
#   -ActNegative switches the action from filtering matches to the behavior of
#                reporting the matches
#   -debug       [STD]
#   -filter      arg for filtering (RegExp or name of file containing list of RegExps)
#   -OutTabType  [STD]
#
sub ProgFilter {
  my (@ArgIn) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $bActNeg = $ProgOpt{-ActNegative};

  # prepare filter
  my $ArgFlt = $ProgOpt{-filter};
  my @flt;
  if ((-r &PathExpand($ArgFlt) and -s &PathExpand($ArgFlt))
     or $ArgFlt eq '-') {
    $ArgFlt = ($ArgFlt eq '-') ? '-' : &PathExpand($ArgFlt);
    my $hFlt = FileHandle->new($ArgFlt);
    while (<$hFlt>) { m/^\S+/ and push @flt,$& }
    if (@flt) {
      $debug and printf STDERR "%s. %d entries in filter file %s\n", &MySub, int(@flt), $ArgFlt||"''";
    } else {
      die sprintf "ERROR: unable to read filter file %s\n", $ArgFlt||"''";
    }
  } else {
    $debug and printf STDERR "%s. -filter switch argument interpreted as plain identifier: %s\n", &MySub, $ProgOpt{-filter}||"''";
    @flt = ( $ProgOpt{-filter} );
  }

  # get primary list from input file(s)
  my ($pCol,@table);
  foreach my $PathIn (@ArgIn) {
    my $pTabCurr;
    ($pTabCurr,$pCol) = &PlainToTable ($PathIn, -TabType=>'AA',
      -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
    if ($pTabCurr) {
      $debug and printf STDERR "%s. %d data line%s in input file %s\n", &MySub,
        int(@$pTabCurr), (@$pTabCurr==1)?'':'s', $PathIn||"''";
      push @table, @$pTabCurr;
    } else {
      printf STDERR "WARNING: no list entries in input file %s\n", $PathIn||"''";
      next;
    }
  }

  # do filtering, output
  if (int (@$pCol)) {
    printf "# column labels:\n# %s\n", join("\t",@$pCol);
  }
  foreach my $pLine (@table) {
    my $line = ($ProgOpt{-OutTabType} eq 'AA') ? join("\t",@$pLine) : $$pLine[0];
    my $bMatch = int (grep { $line=~m/$_/ } @flt);
    if (($bActNeg and $bMatch) or (!$bActNeg and !$bMatch)) {
      print  "$line\n";
    } elsif ($debug) {
      printf STDERR "  entry %s %s\n",
        $$pLine[0]||"''", $bActNeg?'not matched':'filtered';
    }
  }
}


# prepare list of unique list entries
#
# INTERFACE
# - argument 1+: input file(s)
#
# - global options:
#   -ActNegative switches the action from selecting unique entries to selecting
#                non-unique entries
#   -debug       [STD]
#   -OutTabType  [STD]
#
# DESCRIPTION
# - We don't use &MainLib::Misc::unique here, cause:
#   - we have a chance of independent process debugging
#   - we are able to do queued output without keeping temporary copies of
#     the whole table data structure
# - order of output will be the same as in input
#
sub ProgUnique {
  my (@ArgIn) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # loop over input file(s)
  my %unique;
  foreach my $PathIn (@ArgIn) {

    # read input file
    my $pTabCurr = &PlainToTable ($PathIn, -TabType=>'AA',
      -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
    unless ($pTabCurr) {
      printf STDERR "WARNING: no list entries in input file %s\n", $PathIn||"''";
      next;
    }
    $debug and printf STDERR "%s. %d data line%s in input file %s\n", &MySub,
      int(@$pTabCurr), (@$pTabCurr==1)?'':'s', $PathIn||"''";

    # loop over entries (table lines)
    foreach my $pLine (@$pTabCurr) {

      # has already occurred?
      # -unique: filter entry
      # -nonunique: sample entry
      if ($unique{$$pLine[0]}) {
        if ($ProgOpt{-ActNegative}) {
          push @{$unique{$$pLine[0]}}, $pLine;
        } else {
          $debug and printf STDERR "%s. filtered redundant entry %s in file %s\n", &MySub, $$pLine[0]||"''", $PathIn;
          next;
        }
      }

      # add to index holding encountered entries
      # -unique: print first occurrence of entry, count occurrence of entry
      # -nonunique: remind entry
      else {
        if ($ProgOpt{-ActNegative}) {
          $unique{$$pLine[0]} = [ $pLine ];
        } else {
          printf "%s\n",
            ($ProgOpt{-OutTabType} eq 'AA') ? join("\t",@$pLine) : $$pLine[0];
          $unique{$$pLine[0]} = 1;
        }
      }
    }
  }

  # loop over non-unique entries
  if ($ProgOpt{-ActNegative}) {
    foreach my $pLine (map{@{$_||[]}} values %unique) {
      printf "%s\n",
        ($ProgOpt{-OutTabType} eq 'AA') ? join("\t",@$pLine) : $$pLine[0];
    }
  }
}


# randomize lines
#
# INTERFACE
# - argument 1+: input file(s)
#
# - global options:
#   -debug      [STD]
#   --reduce
#
sub ProgRandom {
  my (@ArgIn) = @_;
  my $debug = $ProgOpt{-debug};
  my $reduce = $ProgOpt{-var}{reduce};

  # get list of lines from input file(s)
  my @list;
  foreach my $PathIn (@ArgIn) {
    my $hIn = FileHandle->new($PathIn);
    my $lct=0;
    while (<$hIn>) {
      ++ $lct;
      if ($reduce and rand()>$reduce) { next }
      push @list,$_;
    }
    if ($lct) {
      $debug and printf STDERR "%s. %d data lines in input file %s\n", &MySub,
        $lct, $PathIn||"''";
    } else {
      printf STDERR "WARNING: no list entries in input file %s\n", $PathIn||"''";
      next;
    }
  }

  # randomize, output
  foreach (map{@{$_||[]}} &RandArrayOrder(\@list)) { print }
}


################################################################################
# table structures & formats
################################################################################


# merge tables
#
sub ProgTabColComb {
  my (@ArgTab) = @_;
  my ($debug, $dbg2, $PathTab);
  my ($pTab, $pCol, $pTabNext, $pColNext);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # load 1st table
  $PathTab = shift @ArgTab;
  ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'AC',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);

  # load/add next tables
  while ($PathTab = shift @ArgTab) {
    ($pTabNext, $pColNext) = &PlainToTable ($PathTab, -TabType=>'AC',
      -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
    push @$pTab, @$pTabNext;
    push @$pCol, @$pColNext;
  }
  $pTab = &TableConvert ('AC', 'AA', $pTab,
    -ColLabel=>$ProgOpt{-ColOrder}, -debug=>$dbg2);
  if ($ProgOpt{-OutTabType} and $ProgOpt{-OutTabType} ne 'AA') {
    $pTab = &TableConvert ('AA', $ProgOpt{-OutTabType}, $pTab, -debug=>$dbg2);
  }

  # output - line loop
  printf "# column labels:\n# %s\n", join("\t",@$pCol);
  foreach my $pLine (@$pTab) {
    printf "%s\n", join("\t",@$pLine);
  }
}


# merge tables
#
sub ProgTabColMerge {
  my (@ArgTab) = @_;
  my ($debug, $dbg2, $ColMergePos, $PathTab);
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $ColMergePos = $ProgOpt{-var}{ColTgt};

  my ($pTab, $pCol, $pTabPlus, $pColPlus, $pLine);

  # load 1st table
  # - it would be nice to have the maximum field number reflected by @$pCol
  $PathTab = shift @ArgTab;
  ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'AA',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
  if (! @$pCol) {
    @$pCol = (('A' .. 'Z'))[0 .. (int(@{$$pTab[0]})-1)];
  }

  # load 2nd table - accessed by indexation
  # - it would be nice to have the maximum field number reflected by @$pCol
  $PathTab = shift @ArgTab;
  ($pTabPlus, $pColPlus) = &PlainToTable ($PathTab, -TabType=>'HIA',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
  shift @$pColPlus;
  if (defined ($ColMergePos)) {
    splice @$pCol, $ColMergePos, 0, @$pColPlus;
  } else {
    push @$pCol, @$pColPlus;
  }

  # output table header
  printf "# column labels:\n# %s\n", join("\t",@$pCol);
  # merge/output - line loop
  foreach $pLine (@$pTab) {

    # merge to defined column position
    if (defined ($ColMergePos)) {
      if (@{$$pTabPlus{$$pLine[0]}} < @$pColPlus) {
        $#{$$pTabPlus{$$pLine[0]}} = int(@$pColPlus) - 1;
      }
      splice @$pLine, $ColMergePos, 0, @{$$pTabPlus{$$pLine[0]}};
    }

    # merge by push
    else {
      if (exists($$pTabPlus{$$pLine[0]})) {
        push @$pLine, @{$$pTabPlus{$$pLine[0]}};
      }
    }

    # output - line loop
    printf "%s\n", join("\t",@$pLine);
  }
}


# change column order in table
#
# INTERFACE
# - argument 1: path of table file
#
# - global options:
#   -debug      [STD]
#   -ColOrder   column order, from ProgMode argument
#   -var=splice delete columns, that are not listed in column order argument
#
# DESCRIPTION
# - The default column order is the same as for the input table.
#
sub ProgTabColOrder {
  my ($PathTab) = @_;
  my ($debug, $dbg2, $bTimer, $time);
  my ($pTab, $pCol, $pColLate);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $ProgOpt{-timer};

  # load table
  $bTimer and $time = &Sum ((times)[0,2]);
  ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'HCA',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$dbg2);
  if (! int(@$pCol)) {
    $pCol = [ ('A'..'Z')[0 .. int(keys %$pTab)] ];
  }
  $debug and printf STDERR "%s. original column labels: %s\n", &MySub,
    join("\t",@$pCol);
  if ($bTimer) {
    printf "%s. CPU time for table load: %.3f\n", &MySub, &Sum((times)[0,2])-$time;
    $time = &Sum ((times)[0,2]);
  }

  # custom order, custom table representation
  {
    # prepare custom order of columns
    my %ColChooseIdx = map { ($_=>1) } @{$ProgOpt{-ColOrder}};
    my $pColNew=[];
    push @$pColNew, @{$ProgOpt{-ColOrder}};
    if ($ProgOpt{-var}{splice}) {
      foreach (grep{ !$ColChooseIdx{$_} }@$pCol) {
        delete $$pTab{$_};
      }
    } else {
      push @$pColNew, grep{ !$ColChooseIdx{$_} }@$pCol;
    }

    # convert table - other target types than A1 do not make sense, here
    if ($ProgOpt{-OutTabType} and $ProgOpt{-OutTabType} eq 'A1') {
      foreach (grep{ $_ ne $$pColNew[0] }@$pColNew) {
        delete $$pTab{$_};
      }
      $pColNew = [ $$pColNew[0] ];
    }

    $pCol = $pColNew;
  }
  $debug and printf STDERR "%s. ordered column labels: %s\n", &MySub,
    join("\t",@$pCol);

  # convert table
  # conversions other than to A1 do not make sense, here
  ($pTab, $pColLate) = &TableConvert ('HCA', 'AA', $pTab,
    -ColLabel=>$pCol, -debug=>$dbg2);
  $debug and printf STDERR "%s. column labels after conversion: %s\n", &MySub,
    join("\t",@$pColLate);
  if ($bTimer) {
    printf "%s. CPU time for table conversion: %.3f\n", &MySub, &Sum((times)[0,2])-$time;
    $time = &Sum ((times)[0,2]);
  }

  # output - line loop
  printf "# column labels:\n# %s\n", join("\t",@$pColLate);
  foreach my $pLine (@$pTab) {
    printf "%s\n", join("\t",@$pLine);
  }
}


# format HTML table to CSV
#
# INTERFACE
# - argument 1: path of table file
#
# DESCRIPTION
# - limit: this code cannot handle nested tables. It will get confused by that.
#
sub ProgTabHtml {
  my ($PathIn) = @_;
  my $debug = $ProgOpt{-debug};

  # read input file
  my $doc;
  unless (defined ($doc=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # loop over tables
  while ($doc =~ m/<TABLE/ig) {
    my $PosOff = pos($doc);
    my (@table, $CtField);
    my $CtLine = -1;

    # loop over table rows/fields
    while ($doc =~ m/<([HT][DHR]|\/TABLE)(?:[^>]*)>/ig) {
      my $ItemType = $1;
      #my $ItemOff = pos($doc);

      # new table row
      if ($ItemType =~ m/[HT]R/i) {
        $CtLine ++;
        $CtField = 0;
        next;
      }

      # new table field
      if ($ItemType=~m/[HT][DH]/i and $doc=~m{(.*?)(?:</[HT][DH]>)}igm) {
        my $ItemCont = $1;
        $ItemCont =~ s/\&nbsp;/ /g;
        $ItemCont =~ s/(\s|\n)+/ /g;
        { # remove HTML tags to yield pure values
          $ItemCont =~ s{<A(?:[^>]*)>(.*?)</A>}{$1}ig;
          $ItemCont =~ s/ +/ /g;
        }
        $ItemCont =~ s/^ +//; $ItemCont =~ s/ +$//;
        $table[$CtLine][$CtField] = $ItemCont;
        $CtField ++;
        next;
      }

      # end table
      if ($ItemType=~m/\/TABLE/i) { last }
    }

    # output table
    $debug and printf STDERR "%s. found table at pos. %d\n", &MySub, $PosOff;
    printf "# table starting pos. %d\n", $PosOff;
    foreach my $pLine (@table) {
      printf "%s\n", join("\t",@$pLine);
    }

    # position to continue iteration
    # avoid an endless loop!
    pos($doc) = $PosOff;
  }
}


# format table to HTML
#
# INTERFACE
# - argument 1: path of table file
#
sub ProgTab2Html {
  my ($PathTab) = @_;
  my ($pTab, $pCol);

  # load table
  ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'AA',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1);

  # start table
  print  "<TABLE>\n";

  # line loop
  foreach my $pLine (@$pTab) {
    print  "  <TR>\n";

    # cell loop
    foreach my $val (@$pLine) {
      printf "    <TD>%s</TD>\n", &HtmlCharEncode($val);
    }

    # finish line
    print  "  </TR>\n";
  }

  # end table, tidy up
  print  "</TABLE>\n";
}


# list of hit pairs from Blast.pl -ListQuery table
#
# INTERFACE
# - argument 1: path of table file
#
# DESCRIPTION
# - the table header MUST be part of the table input.
#
sub ProgTabBlastPair {
  my ($PathTab) = @_;
  my ($debug);
  my ($pTab, @CurrID, @ID, %HadID);

  # function parameters
  $debug = $ProgOpt{-debug};

  # load table
  $pTab = &PlainToTable ($PathTab, -TabType=>'AH',
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1);

  # reformat
  foreach my $pLine (sort { $b->{HighestScore} <=> $a->{HighestScore} } @$pTab) {
    if ($$pLine{QueryID} and $$pLine{NmbMatch}) {
      if ((@CurrID = split (/ +/, $$pLine{Matches})) > 3) {
        push @ID, splice (@CurrID, 0, 3);
      } else {
        push @ID, @CurrID;
      }
      push @ID, $$pLine{QueryID};
    }
  }

  # sort, filter, output
  foreach (@ID) {
    $HadID{$_} or print "$_\n";
    $HadID{$_} ++;
  }
}


# filter hit pairs for centromer seq motifs according to appropriate topology
#
# INTERFACE
# - argument 1: path of table file, TAB-delimited format expected
#
# DESCRIPTION
# - this was an effort to construct complex (yeast-like centromer) motif hits
#   from elementary motif hits (CEN1, CEN3)
#
sub ProgTabCentromere {
  my ($PathTab) = @_;
  my ($debug);
  my (@column, $pTab);
  my ($CtLine, $CtBack, $pLinePre);

  # function parameters
  $debug = $ProgOpt{-debug};

  # load table
  @column = qw(contig offset score seq orient mtf);
  $pTab = &PlainToTable ($PathTab, -TabType=>'AH',
    -ColLabel=>\@column, -comments=>1);

  # sort
  @$pTab = sort {
    $$a{contig} cmp $$b{contig} or
    $$a{orient} <=> $$b{orient} or
    $$a{offset} <=> $$b{offset} or
       $$a{mtf} cmp $$b{mtf} or
  1 } @$pTab;
  @column = qw(contig SumScore orient Cen13dist Cen1off Cen1score Cen1seq Cen3off Cen3score Cen3seq);
  print  "# column labels:\n";
  printf "# %s\n", join ("\t", @column);

  # line loop
  for ($CtLine=1; $CtLine<@$pTab; do { $CtLine++; $CtBack=1; }) {
    if ($CtLine-$CtBack < 0) { next }
    $pLinePre = $$pTab[$CtLine-$CtBack];
    my $pLine = $$pTab[$CtLine];

    # need at least two hits per contig
    if ($$pLine{contig} ne $$pLinePre{contig}) { next }

    # need appropriate distance of hits
    if ($$pLine{offset} - $$pLinePre{offset} < 80) { $CtBack++; redo; }
    if ($$pLine{offset} - $$pLinePre{offset} > 320) { next }

    # need pair of hits (ordered): Cen1, Cen3
    # NOTE: positional values always refer to the plus-stranded sequence
    if ($$pLine{mtf} eq 'Cen1') { next }
    if ($$pLine{mtf} eq $$pLinePre{mtf}) { $CtBack++; redo; }

    # need appropriate (parallel) orientation of hits
    if ($$pLine{orient} != $$pLinePre{orient}) { $CtBack++; redo; }

    # reformat line, output
    $$pLinePre{SumScore} = $$pLinePre{score} + $$pLine{score};
    $$pLinePre{Cen13dist} = abs ($$pLinePre{offset} - $$pLine{offset});
    (@{$pLinePre}{'Cen1off','Cen1score','Cen1seq','Cen3off','Cen3score','Cen3seq'}) =
      map { $_->{offset}, $_->{score}, $_->{seq} }
      sort { $$a{mtf} cmp $$b{mtf} } $pLinePre, $pLine;
    printf "%s\n", join ("\t", @{$pLinePre}{@column});
  }
}


# recalculate tabular plot data to plot of integral
#
# INTERFACE
# - argument 1: path of plot data table 1
# - argument 2: path of plot data table 2
# - argument 3: subroutine string
#
sub ProgTabCrosscalc {
  my (@PathTab, $sEval);
     ($PathTab[0], $PathTab[1], $sEval) = @_;
  my ($debug);
  $debug = $ProgOpt{-debug};

  my (@plot, %xval);

  # load tables
  foreach my $CtTab (0,1) {
    $plot[$CtTab] = Math::Plot2D->new($PathTab[$CtTab],
      -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$debug);
    unless ($plot[$CtTab] and @{$plot[$CtTab]}) {
      printf STDERR "%s. ERROR: no plot data from table file %s\n", &MySub,
        $PathTab[$CtTab]||"''";
      return;
    }
  }

  # recalculate plot
  %xval = map{ ($_=>$_) } map{ $_->[0] } map{ @{$plot[$_]{pdata}} } (0,1);
  $debug and printf STDERR "%s. total of %d unique x values\n", &MySub,
    int(keys(%xval));
  $debug and printf STDERR "%s. evaluating %s\n", &MySub, int(keys(%xval));
  foreach my $ItX (sort values %xval) {
    my $x = $ItX;
    my $y1 = $plot[0]->Interpolate($ItX);
    my $y2 = $plot[1]->Interpolate($ItX);
    my $ynew = eval ($sEval);
    if (! defined($ynew)) { next }
    printf "%s\t%s\n", $x, $ynew;
  }
}


# recalculate tabular plot data to plot of integral
#
# INTERFACE
# - argument 1: path of table file
#
sub ProgTabCumul {
  my ($PathTab) = @_;
  my ($debug);
  $debug = $ProgOpt{-debug};

  my ($pTab, $pPlot, $integral);

  # load table
  $pPlot = Math::Plot2D->new($PathTab,
    -delimCol=>$ProgOpt{-delimCol}, -comments=>1, -debug=>$debug);
  unless ($pPlot and @$pPlot) {
    printf STDERR "%s. ERROR: no plot data from table file %s\n", &MySub, $PathTab||"''";
    return;
  }

  # recalculate plot
  my $resol = $ProgOpt{-var}{resol} || 500;
  my $rev = $ProgOpt{-var}{reverse};
  my $xlast;
  my $xoff = $xlast = ($rev ? scalar $pPlot->Xmax() : scalar $pPlot->Xmin());
  print  "$xoff\t0\n";
  my $xstep = ($pPlot->Xmax() - $pPlot->Xmin()) / $resol * ($rev ? -1 : 1);
  $debug and printf STDERR "%s. x range [%s,%s], x_off %s, x_step %s\n", &MySub,
    $pPlot->Xmin(), $pPlot->Xmax(), $xoff, $xstep;
  my $xpos = $xoff + $xstep;
  my $cti;
  for ($cti=0; $cti<$resol; $cti++) {
    $integral += $pPlot->Integral (sort($xpos,$xlast));
    # output
    print  "$xpos\t$integral\n";
    # continue
    $xlast = $xpos;
    $xpos += $xstep;
  }
}


# recalculate Plot2D data structure
#
# INTERFACE
# - argument 1: path of data structure
# - argument 2: path of data structure
#
sub ProgTabDataPlot2D {
  my ($PathDat1, $PathDat2) = @_;
  my ($pDat1, $pDat2);

  # load table
  $pDat1 = &DataRead ($PathDat1);
  $pDat2 = &DataRead ($PathDat2);

  # reformat I
  splice (@{$$pDat1{plot}}, 1, 0, $$pDat2{plot}[0]);
  $$pDat1{plot}[1]{ReprColor} = 'blue';
  $$pDat1{plot}[1]{DataRange}{x} = $$pDat1{plot}[0]{DataRange}{x};

  # reformat II
  foreach my $pLine (@{$$pDat1{scale}}) {
    if ($$pLine{PlotNum} == 1) {
      $$pLine{PlotNum} = 2;
    }
  }

  # output
  &DataPrint ($pDat1);
}


# re-calculate result table for DDT-A cDNA screen
#
# INTERFACE
# - argument 1: path of table file, TAB-delimited format expected
#
sub ProgTabDdtScreen {
  my ($PathTab) = @_;
  my ($pTab, $pCol);
  my (%CloneIdx, %PosIdx, $ItClone, $ItExchg);
  my ($MaxClone, $MaxPos, $ItPos, %line, $StrPos);

  # function parameters

  # load table
  ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'AH',
    -ColShift=>1, -comments=>1);

  # reformat list of exchanges
  foreach my $pLine (@$pTab) {

    # skip clones exerting fatal InDels
    if ($$pLine{fatale_InDels} > 0.5) { next }
    # skip clones exerting preliminary stop
    if ($$pLine{vorzeitiges_Stop}) { next }

    # reformat list of exchanges
    $CloneIdx{$$pLine{'Klon-Namen'}} = [ split (/ *, */, $$pLine{Liste_Austausche}) ];
  }
#  # output
#  &DataPrint (\%CloneIdx);

  foreach $ItClone (keys %CloneIdx) {
    foreach $ItExchg (@{$CloneIdx{$ItClone}}) {
      # reformat positional index of exchanges for
      # - current clone (loop 1)
      # - current position (loop 2)

      # convert exchange entry
      if ($ItExchg =~ m/^(\d+) *([A-Z])->([A-Z]) *\((.+)\)/) {
        $PosIdx{$1}{$ItClone} = $3;
        if ($4 ne '-') {
          if (index ($4, '*') >= 0) {
            $PosIdx{$1}{$ItClone} = '<FONT COLOR=#D09070>'. $PosIdx{$1}{$ItClone} .'</FONT>';
          } else {
            $PosIdx{$1}{$ItClone} = '<FONT COLOR=#90D070>'. $PosIdx{$1}{$ItClone} .'</FONT>';
          }
        }
        $PosIdx{$1}{consensus} ||= $2;
      }

      # convert InDel entry
      elsif ($ItExchg =~ m/^(\d+) *(del|ins) *\(.+\)/) {
        $PosIdx{$1}{$ItClone} = '*';
        $PosIdx{$1}{$ItClone} = '<FONT COLOR=#90D070>'. $PosIdx{$1}{$ItClone} .'</FONT>';
        $PosIdx{$1}{consensus} ||= '?';
      }

      # unknown entry format
      else {
        printf STDERR "RegExp mismatch on exchange entry: %s\n", $ItExchg;
        next;
      }
    }
  }

  # print alignment of exchanges
  $MaxClone = &Max (map { length($_) } keys (%CloneIdx), 'consensus');
  $MaxPos = &Max (map { length($_) } keys (%PosIdx));
  foreach (1 .. $MaxPos) {
    $line{'.head'.$_} = ' ' x ($MaxClone + 2);
  }
  $line{'.head'.($MaxPos+1)} = '';
  foreach $ItClone (keys (%CloneIdx), 'consensus') {
    $line{$ItClone} = sprintf ("%${MaxClone}s  ", $ItClone);
  }
  foreach $ItPos (sort { $a <=> $b } keys %PosIdx) {
    $StrPos = sprintf ("%${MaxPos}d", $ItPos);
    foreach (1 .. $MaxPos) {
      $line{'.head'.$_} .= (split(//,$StrPos))[$_-1];
    }
    foreach $ItClone (keys (%CloneIdx), 'consensus') {
      $line{$ItClone} .= $PosIdx{$ItPos}{$ItClone} || '.';
    }
  }
  print  "<HTML><BODY BGCOLOR=#FFFFFF><PRE>\n";
  print  map { "$line{$_}\n" } sort keys (%line);
  print  "</PRE></BODY></HTML>\n";
}


# combine a set of pairwise distance tables
#
# INTERFACE
# - argument 1: regexp to select table files from current directory
#               bracketed phrase contains set-specific parameter.
#               Tables are expected to be in TAB-delimited format
#
sub ProgTabDistSetComb {
  my ($ArgSlc) = @_;
  my ($debug, $dbg2);
  my (@PathSrc, $PathTab, $TabParam, $pTab, $pCol);
  my ($key1, $key2);

  # function parameters
  $debug = $ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # get paths of table files
  @PathSrc = &ReadDir ('./', -select=>$ArgSlc);
  $debug and printf STDERR "%s. matching files: %d\n", &MySub, int @PathSrc;

  # loop over tables, load into 2D hash
  foreach $PathTab (@PathSrc) {
    $PathTab =~ m/$ArgSlc/;
    $TabParam = $1;
    $debug and printf STDERR "%s. loading table: %s, param %s\n", &MySub,
      $PathTab, $TabParam;
    ($pTab, $pCol) = &PlainToTable ($PathTab, -TabType=>'HIH',
      -comments=>1, -debug=>$dbg2);

    # loop over fields
    foreach $key1 (sort grep { $_ ne 'consensus' } keys %$pTab) {
      my $pLine = $$pTab{$key1};
      foreach $key2 (sort grep { $_ ne 'consensus' and $_ ne 'id' } keys %$pLine) {

        # output pairwise distance
        $$pTab{$key1}{$key2} and printf "%s-%s\t%s\t%s\n",
          $key1, $key2, $TabParam, $$pTab{$key1}{$key2};
      }
    }
  }
}


# re-format HAPPY map table to fastA file
#
# INTERFACE
# - argument 1:  path of table file, TAB-delimited format expected
# - argument 2*: optional: path of marker index file
#
sub ProgTabHappyFrom {
  require SeqLab::SeqFormat; SeqLab::SeqFormat->import qw(&SeqentryToFasta);
  my ($PathTab, $PathIdx) = @_;
  my ($debug);
  my ($pTab, $pIdx);

  # function parameters
  $debug = $ProgOpt{-debug};

  # load table, load index
  $pTab = &PlainToTable ($PathTab, -TabType=>'AH', -ColShift=>1, -comments=>1);
  shift @$pTab;
  if ($PathIdx) {
    $pIdx = &PlainToTable ($PathIdx, -TabType=>'HIA', -ColShift=>1, -ColIdx=>1, -comments=>1);
    $debug and printf STDERR "%s. first entries in index: %s\n", &MySub, join (' ', (keys %$pIdx)[0..5]);
  }

  ##############################################################################
  # format from HAPPY map Chr6

  # reformat - line loop
  foreach my $pLine (@$pTab) {

    # some field conversions
    $$pLine{'Sequence Name'} =~ s/(\.seq)?[\s\r\n]*$//i;
    $$pLine{'Sequence Name'} =~ tr/ / /s;

    $$pLine{'Linkage Group'} ||= 'UNK';

    $$pLine{'Marker ID'} ||= $$pIdx{$$pLine{'Sequence Name'}}[0];
    $debug and printf STDERR "%s. first values in indexed line: %s %s\n", &MySub,
      $$pIdx{$$pLine{'Sequence Name'}}[0], $$pIdx{$$pLine{'Sequence Name'}}[1];

    # change to sequence data structure
    my $pSeq = {
      id       => $$pLine{'Marker ID'},
      descr    => sprintf ('from %s, mapped to %s %s',
                  $$pLine{'Sequence Name'},
                  $$pLine{'Linkage Group'},
                  (defined($$pLine{'Marker Pos'} and $$pLine{'Marker Pos'} >= 0)) ? sprintf ('%.3f', $$pLine{'Marker Pos'}) : '',
                  ),
      sequence => $$pLine{Sequence},
    };

    # output sequence
    print  &SeqentryToFasta ($pSeq, -pure=>'DNA5', -upper=>1);
  }
}


# convert rich fastA to HAPPY mapping submission
#
# INTERFACE
# - argument 1+: path of fastA
#
sub ProgTabHappyTo {
  require SeqLab::SeqStreamIn;
  my (@PathSeq) = @_;
  my ($debug);
  my (@column);
  my ($pSeqQueue, $pSeq, %line, %probab);

  # function parameters
  $debug = $ProgOpt{-debug};

  # table header
  @column = (
    'SEQUENCE NAME', 'SEQUENCE',
    'SEGMENT NAME', 'SEGMENT TYPE',
    'SEQUENCE SOURCE', 'CHROMOSOME', '% CHROMOSOMAL CONFIDENCE', 'CHROMOSOMAL ASSIGNMENT METHOD',
    'SUBMITTED BY',
    );
  @line{'SEGMENT TYPE', 'SEQUENCE SOURCE', 'CHROMOSOMAL ASSIGNMENT METHOD', 'SUBMITTED BY'} =
    ('scaffold', 'Chr3:pUC18', 'probabilistics Szafranski-K', 'GSC Jena');
  printf "%s\n", join ("\t", @column);

  # loop over sequences
  $pSeqQueue = SeqLab::SeqStreamIn->new(@PathSeq);
  while ($pSeq = $pSeqQueue->GetNext()) {

    # basic fields
    $debug and printf STDERR "%s -> ", $$pSeq{id};
    $$pSeq{id} =~ m/-[35](r?)$/ and $$pSeq{id} = $` . $1;
    $debug and printf STDERR "%s\n", $$pSeq{id};
    $line{'SEQUENCE NAME'} = $$pSeq{id};
    if ($$pSeq{descr} =~ m/from (scaffold|pseudo[ -]contig) ([^\s,]+)/) {
      $line{'SEGMENT NAME'} = $1;
    } else {
      $line{'SEGMENT NAME'} = $$pSeq{id};
      $line{'SEGMENT NAME'} =~ s/(\d)r?$/${1}/e;
      $line{'SEGMENT NAME'} =~ s/(\d)a[lr]$/${1}/e;
    }
    $line{SEQUENCE} = $$pSeq{sequence};

    # probabilistic fields
    while ($$pSeq{descr} =~ m/(Chr\d+) ([-.\d]+)/g) {
      $probab{$1} = 10 ** $2;
    }
    $line{CHROMOSOME} = (sort { $probab{$b} <=> $probab{$a} } keys %probab)[0];
    $line{'% CHROMOSOMAL CONFIDENCE'} = sprintf ('%d', 100 * $probab{$line{CHROMOSOME}});

    # output table line for sequence entry
    printf "%s\n", join ("\t", @line{@column});
  }
}


# re-format HAPPY map table to fastA file
#
# INTERFACE
# - argument 1+: path(s) of table file(s)
#
sub ProgTabInvPcr {
  my (@ArgRpt) = @_;
  my ($debug);
  my (@column, %entry);

  # function parameters
  $debug = $ProgOpt{-debug};
  @column = qw(locus date enz1 enz2 PrimerPair TO_product TO_RingSize_fake TO_ProductSize);
  printf "# column labels:\n# %s\n", join ("\t", @column);

  # loop over reports
  foreach my $PathRpt (@ArgRpt) {
    my $hInRpt = FileHandle->new($PathRpt);
    my $line;

    # line loop
    while (defined ($line=<$hInRpt>)) {
      $line =~ m/inv-/ or next;

      # sample fields
      $'    =~ m/^(\w+)\((\w+)\)/ and
        $entry{enz1} = $1 and $entry{enz2} = $2;
      $entry{date} = '('. substr (&TimeStr (-time=>&ftime($PathRpt), -format=>'CompactComp'), 0, 8);
      $line =~ m/=> (>?\d+) bp/ and
        $entry{TO_ProductSize} = $1;
      $line =~ m/JP\d+\/(JP)?\d+/ and
        $entry{PrimerPair} = $&;
      $line =~ m/JPCR\w+/ and
        $entry{TO_product} = "($&";
      $'    =~ m/\(([^)]+)\)/ and
        $entry{locus} = $1;

      # output invPCR entry
      printf "%s\n", join ("\t", @entry{@column});
      %entry = ();

    } # end line loop
  } # end report loop
}


################################################################################
# special format conversions
################################################################################


# filter error messages (quality drop) from Align.pl LOG file
#
# INTERFACE
# - argument 1: input file
#
sub ProgAlignErr1 {
  my ($PathIn) = @_;
  my ($buffer);
  my ($extract);

  # read input file
  unless (defined ($buffer=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # remove page header/footer
  if ($buffer =~ m/\n(ProjectEntryAdd. Violation)/s) {
    $extract = $1 . $';
    if ($buffer =~ m/\n  (project file: .*?\n)  (project name: .*?\n)/s) {
      substr ($extract, 0, 0) = $1 . $2 . "\n";
    } else {
      printf STDERR "%s. didn't get project file/name from LOG\n", &MySub;
    }
    substr ($extract, 0, 0) = "\nAlign.pl LOG $PathIn\n";
    print  $extract;
  }
}


# filter problem entries from AlnK project file
#
# INTERFACE
# - argument 1: input file
#
sub ProgAlignErr2 {
  my ($PathIn) = @_;
  my ($buffer);
  my ($extract);

  # read input file
  unless (defined ($buffer=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # remove page header/footer
  if ($buffer =~ m/\n(  problem  \{\n.*?  \}\n)/s) {
    $extract = $1;
    substr ($extract, 0, 0) = "\nAlnK project file $PathIn\n";
    print  $extract;
  }
}


# convert *blz2c.lav file to extended tabular format
#
# INTERFACE
# - argument 1: *blz2c.lav file
#
sub ProgBlzlav2tab {
  my ($FileIn) = @_;
  my ($hIn, $buffer, @aln, @column);

  # parse *blz2c.lav file
  unless ($hIn=FileHandle->new($FileIn)) {
    print STDERR "ERROR: unable to read file $FileIn\n";
    exit 1;
  }
  while ($buffer = <$hIn>) {
    $buffer =~ m/^a \{/ or next;
    while ($buffer = <$hIn=>) {
      $buffer =~ m/^ +[a-z]/ or last;
      $buffer =~ m/^ +l(( \d+)+)$/ or next;
      push @aln, [ (split (/ +/, $1))[1..5] ];
      $aln[-1][5] = &Mean ($aln[-1][2]-$aln[-1][0], $aln[-1][3]-$aln[-1][1]);
    }
    unless ($buffer =~ m/^\}/) {
      print STDERR "ERROR: didn't find closing bracket '}'. *blz2c.lav file seems to be corrupt\n";
      exit 1;
    }
  }
  undef $hIn;

  # output tab format
  @column = qw(offseq1 offseq2 endseq1 endseq2 percid alnlen);
  printf "%s\n", join ("\t", @column);
  foreach (sort { $a->[0] <=> $b->[0] or $a <=> $b } @aln) {
    printf "%s\n", join ("\t", @$_);
  }
}


# filter *blz2c.pip*.ps file
#
# INTERFACE
# - argument 1: *blz2c.pip file
# - argument 2: cut-off value for local alignment length
#
sub ProgBlzpippsFilter {
  my ($FileIn, $cutoff) = @_;
  my ($plain, %buffer, $line, @column, $len);

  # parse *blz2c.pip*.ps file
  unless ($plain = &ReadFile($FileIn)) {
    print STDERR "ERROR: unable to read file $FileIn\n";
    exit 1;
  }

  # match feature block
  while ($plain =~ m/\nplotdata\n((\d[^\n]+Line\n)+)/sg) {
    $buffer{pre} = $`;
    $buffer{post} = $';
    $buffer{lineO} = [ split (/\n/, $1) ];
    $buffer{lineF} = [ "\nplotdata" ];

    # convert/filter feature block
    while ($line = shift @{$buffer{lineO}}) {
      @column = split (/ +/, $line);
      $len = $column[2] - $column[0];
      printf STDERR "%s. feature line, offset %d\n", &MySub, $column[0];
      if ($len >= $cutoff) {
        printf STDERR "%s. entering line, feature length %d\n", &MySub, $len;
        push @{$buffer{lineF}}, $line;
      }
    }

    # next feature block
    $plain = $buffer{pre} .
      join ('', map { "$_\n" } @{$buffer{lineF}}) .
      $buffer{post};
    pos($plain) = length($buffer{pre}) + 10;
  }

  # output converted *.ps format
  print $plain;
}


# informative data line from COG server report
# *** certainly outdated ***
#
# INTERFACE
# - argument 1: input file
#
sub ProgCogLine {
  my ($PathIn) = @_;
  my ($debug);
  my ($buffer, $bAmbig, $id, $idpre, $extract, $val, $line);

  # function parameters
  $debug = $ProgOpt{-debug};

  # read input file
  unless (defined ($buffer=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # ambiguous match?
  if ($buffer =~ m/Ambiguous/) {
    $bAmbig = 1;
  }

  # strip query ID
  if ($buffer =~ m/<strong>((<\/?\w([^>]|\n)*?>[\n\r]*)+)(\S+)/s) {
    $id = $4;
    $idpre = $1;
    $buffer = $';
    if ($debug) {
      $idpre =~ s/\n//g;
      printf STDERR "$idpre [%d chars left]\n", length $buffer;
    }
  } else {
    die sprintf "ERROR: regexp match failed (step 1) - probably code error - file %s\n", $PathIn||"''";
  }

  # locate result table
  if ($buffer =~ m/<table[\S\s\n]*?pet-score:[\S\s\n]*?<\/table>/mi) {
    $extract = $& . "\n";

    # locate classification letters
    if ($extract =~ m/color=#000000>\w+<\/font>/i) {
      while ($extract =~ m/color=#000000>(\w+)<\/font>/g) {
        $val = ($val ? ',':'') . $1;
      }
    } else {
      die sprintf "ERROR: regexp match failed (step 3) - probably code error - file %s\n", $PathIn||"''";
    }
  } else {
    $debug and printf STDERR "WARNING: regexp match failed (step 2) - may be code error or no matches in COG database\n";
  }

  # output parsed information
  print  $line = sprintf ("%s\t%s\t%d\n", $id, $val, $bAmbig);
}


# extract division labels from GenBank release file "gbrel.txt"
#
# INTERFACE
# - argument 1: input file
#
# DEBUG, CHANGES, ADDITIONS
# - the base count per division may exceed the value range of long int.
#   This must be minded when using printf ('%d", ...). We might use
#   Math::BigInt in the future.
#
sub ProgGbRel2div {
  my ($PathIn) = @_;
  my ($debug);
  my ($doc, $bErrRE, $posOff, $posEnd, %div);

  # function parameters
  $debug = $ProgOpt{-debug};

  # read input file
  unless ($doc = &ReadFile($PathIn)) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # localize and extract chapter of "Per-Division Statistics"
  $doc =~ m/ Per-Division Statistics\s*$/gm or $bErrRE|=1;
  $doc =~ m/Division\s+Entries\s+Bases/g or $bErrRE|=2;
  $doc =~ m/^\b/gm or $bErrRE|=4;
  $posOff = pos($doc);
  $doc =~ m/^$/gm or $bErrRE|=8;
  if ($bErrRE) {
    die sprintf "ERROR: parsing error, code %d\n", $bErrRE;
  }
  $posEnd = pos($doc);
  $doc = lc (substr ($doc, $posOff, $posEnd-$posOff));
  $debug and printf STDERR "%s. extracted chapter:\n%s\n", &MySub, $doc;

  # extract division labels, sizes from lines of sequence packages
  $doc =~ s/^gb//gm;
  while ($doc =~ m/^([a-zA-Z]+)\d*\s+(\d+)\s+(\d+)/gm) {
    $div{$1}{labellc} = lc $1;
    $div{$1}{labeluc} = uc $1;
    $div{$1}{packages} ++;
    $div{$1}{entries} += $2;
    $div{$1}{bases} += $3;
  }

  # output division statistics
  # - note: $div{$1}{bases} is a large number (possibly a Math::BigInt object.
  #   Therefore, the printf format descriptor must be '%s' not '%d'.
  foreach (sort (keys %div)) {
    printf "%s\t%s\t%d\t%d\t%s\n", $div{$_}{labellc}, $div{$_}{labeluc},
      $div{$_}{packages}, $div{$_}{entries}, $div{$_}{bases};
  }
}


# extract release number from GenBank release file "gbrel.txt"
#
# INTERFACE
# - argument 1: input file
#
sub ProgGbRel2vers {
  my ($PathIn) = @_;
  my ($doc, $bErrRE, %data);

  # read input file
  unless ($doc = &ReadFile($PathIn,-LineLimit=>5)) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # localize and extract chapter of "Per-Division Statistics"
  $doc =~ m/\n\s+(.+)$/m or $bErrRE|=1;
  $data{date} = $1;
  $doc =~ m/Release (\d+\.\d)$/m or $bErrRE|=2;
  $data{number} = $1;
  if ($bErrRE) {
    die sprintf "ERROR: parsing error, code %d\n", $bErrRE;
  }

  # output data
  printf "date  %s\nnumber  %s\n", $data{date}, $data{number};
}


# GtKtalog report to list of filenames
#
# INTERFACE
# - argument 1: input file
#
sub ProgGtKtalog {
  my ($PathIn) = @_;
  my ($debug);
  my ($hIn, @line, $buffer);
  my ($media, @folder, $file);

  # function parameters
  $debug = $ProgOpt{-debug};

  # open input file
  unless ($hIn=FileHandle->new($PathIn)) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # skip header lines
  GtKtalogHead: {
    unless (defined ($line[0]=<$hIn>)) {
      die sprintf "ERROR: early end of file (no body)\n";
    }

    if ($line[0] =~ m/folder location/) {
      while (defined($line[0]=<$hIn>) and $line[0]=~m/^$/) { next }
      last;
    } else {
      $debug and printf STDERR "%s. skipped line:\n%s", &MySub, $line[0];
      redo;
    }
  }

  # loop over input lines
  # keep two lines in buffer
  GtKtalogLine: {
    $line[0] ||= <$hIn> || <$hIn>;
    chomp $line[0];
    $line[1] ||= <$hIn> || <$hIn>;
    chomp $line[1];
    defined ($line[0]) or last;
    $debug and printf STDERR "%s. next lines:\n%s\n%s\n", &MySub,
      $line[0], $line[1];

    # new data media
    if ($line[1] =~ m/^-+ *$/) {
      $media = $line[0];
      @folder = ();
      $debug and printf STDERR "%s. current path: %s\n", &MySub, "[$media]";
      @line = ();
      redo;
    }

    # new subfolder
    if ($line[1] =~ m/^( *)~+ *$/) {
      $buffer = $1;
      unless ($line[0] =~ m/^ *(.+) \(/) {
        die sprintf "ERROR: missing subfolder label - aborted";
      }
      $#folder = length($buffer) / 2 - 1;
      $folder[length($buffer) / 2 - 1] = $1;
      $debug and printf STDERR "%s. current path: %s\n", &MySub, join ('/', "[$media]", @folder);
      @line = ();
      redo;
    }

    # file entry
    if ($line[0]) {
      unless ($line[0] =~ m/^( *)(.+)$/) {
        die sprintf "ERROR: missing file label - aborted";
      }
      $buffer = $1;
      $file = $2;
      if (@folder != (length($buffer) / 2 - 1)) {
        die sprintf "ERROR: found file in wrong folder level - aborted";
      }
      $debug and printf STDERR "%s. current path: %s\n", &MySub, join ('/', "[$media]", @folder, $file);
      printf "%s\n", join ('/', "[$media]", @folder, $file);
      $line[0] = $line[1];
      $line[1] = '';
      redo;
    }
  }
}


# convert man page
#
# INTERFACE
# - argument 1: input file
#
sub ProgMan {
  my ($PathIn) = @_;
  my ($buffer);
  my ($RegexpHeader, $RegexpFooter);

  # read input file
  unless (defined ($buffer=&ReadFile($PathIn))) {
    die sprintf "ERROR: unable to read input file %s\n", $PathIn||"''";
  }

  # remove page header/footer
  if ($buffer =~ m/^\n{3}([^\n]+)\n\n{3}/s) {
    $RegexpHeader = &RegexpEncode ($1);
    if ($buffer =~ m/\n\n{3}([^\n]+)\n\n{3}\n{3}$RegexpHeader\n\n{3}/s) {
      $RegexpFooter = &RegexpEncode ($1);
      $RegexpFooter =~ s/ +\d/ +\\d+/g;
    }
    if ($ProgOpt{-debug}) {
      printf STDERR "%s. RegexpHeader: %s\n", &MySub, $RegexpHeader||"''";
      printf STDERR "%s. RegexpFooter: %s\n", &MySub, $RegexpFooter||"''";
    }
    $buffer =~ s/^\n+//gs;
    $buffer =~ s/\n{3}${RegexpFooter}\n\n{3}\n{3}${RegexpHeader}\n\n{3}//gs;
  } elsif ($ProgOpt{-debug}) {
    printf STDERR "%s. didn't find any page header\n", &MySub;
  }

  # remove backspace combinations
  $buffer =~ s/(_\x08|\x08_)//g;

  # output
  print  $buffer;
}


# reorder table to a desired order of entries
#
# INTERFACE
# - argument 1:  list of entries in desired order
# - argument 2+: tabular input file(s)
#
# DESCRIPTION
# - the list of entries in desired order MUST NOT contain duplicates
# - the implementation follows two principles, in order of priority:
#   - read the input only once
#   - load as few entries as possible into memory
#
sub ProgOrder {
  my ($ford, @fin) = @_;

  # read list containing entries in desired order
  my (@ord);
  { my $hOrd=FileHandle->new($ford) or
      die sprintf "ERROR: unable to read input file %s", $ford||"''";
    while (<$hOrd>) {
      m/^[^#\s]\S*/ and push @ord,$&;
    }
  }

  # loop over tabular input files
  my (%read);
  my $bHdr=0;
  my $i=0;
  foreach my $fcurr (@fin) {
    my $hIn=FileHandle->new($fcurr) or
      die sprintf "ERROR: unable to read input file %s", $fcurr||"''";
    # re-output header
    if (! $bHdr) {
      while (defined($_=<$hIn>) and m/^#/) { print }
      if (length($_) and m/^\S+/) { $read{$&}=$_ }
      $bHdr=1;
    }
    # redo block for reading of current input file
    loop_file: {
      if ($i > $#ord) { last }
      if (exists ($read{$ord[$i]})) {
        print $read{$ord[$i]};
        delete $read{$ord[$i]};
        ++$i; redo;
      }

      # crawl until desired entry of eof
      while (defined ($_=<$hIn>)) {
        m/^\S+/ or next;
        if ($& eq $ord[$i]) {
          print; ++$i; redo loop_file;
        }
        $read{$&}=$_;
      }
    } # end loop_file
  } # end foreach my $fcurr

  # loop over tabular input files
  for ( ; $i<=$#ord; ++$i) {
    if (! exists($read{$ord[$i]})) {
      printf STDERR "WARNING: missing entry %s, desired position %d (starting at 1, ignoring header)\n",
        $ord[$i], $i+1;
      next;
    } else {
      print $read{$ord[$i]};
      delete $read{$ord[$i]};
    }
  }
}


# convert pfam2go translation table to data structure format
#
# INTERFACE
# - argument 1: input file
#
# DESCRIPTION
# - background
#   pfam2go is a data translation ressource provided by the GeneOntology
#   consortium, cf. http://www.geneontology.org/doc/GO.indices.html
#
sub ProgPfam2go {
  my ($PathIn) = @_;
  my ($hIn, $line);
  my (%PfamIdx);

  # open input file
  unless ($hIn=FileHandle->new($PathIn)) {
    die sprintf "ERROR: unable to read input file %s", $PathIn||"''";
  }

  # read and parse into dictionary
  while ($line = <$hIn>) {
    if ($line =~ m/^!/) { next }
    chomp $line;

    # parse line
    if ($line =~ m/^(Pfam:(PF\d{5})) +[a-zA-Z0-9_-]+ +> +GO:.+? ; (GO:\d{7})\b/) {
      $PfamIdx{$1} = $PfamIdx{$2} ||= [];
      push @{$PfamIdx{$1}}, $3;
    } else {
      printf STDERR "WARNING: regexp failed to match, line:\n  %s\n", $line;
    }
  }

  # save constructed dictionary as data structure file
  &DataPrint (\%PfamIdx);
}
# $Id: Convert.pl,v 1.27 2008/06/11 08:44:58 szafrans Exp $
