#! /usr/local/bin/perl
################################################################################
#
#  Sequence Laboratory
#  K's BLAST Suite
#
#  copyright (c)
#    Fritz Lipmann Institute Jena, Genome Analysis Group, 2005-2006, 2008, 2013
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  PURPOSE
#    See function &usage
#
#  COMMAND LINE SYNTAX
#    See function &usage
#
#  DESCRIPTION
#  - Each function has a comment header that describes at least the calling
#    interface, and possibly more.
#  - A graphical overview of the subroutine dependencies and the data flow can
#    be found at $CorePath{call}{MeInstDoc}/BlastFlow.png
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   %GlobStore
#   $ProgFile,$ProgFstump
#   %ProgParam
#   $ProgMode,%ProgOpt,@ProgArg
#
# - usage help, command line arguments
#   &usage
#   &AddSwitch
#   &ErrorParse
#
# - basic I/O
#   &HdlQueue
#   &SeqQueue
#   &PrepOutpath
#   &PrepTabHeader
#   &ProgListParam
#
# - classical report
#   &BlastPlain
#   &BlastHtml
#
# - data flow architecture
#   &GetBlast
#   &GetBlastByseq
#   &GetMatches
#   &GetHsps
#
# - data structure-oriented output
#   &BlastData
#   &BlastCrossHsp
#
# - list output
#   &BlastListQuery
#   &BlastListMatch
#   &BlastListId
#   &BlastListHsp
#   &BlastListScore
#
# - annotation and masking
#   &BlastSeqHsp
#   &BlastSeqAnnot
#   &BlastSeqMask
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - organize table output handle with global variable $ProgParam{handle}{tab}
#   like in SeqHandle.pl.
#
# - BUG statement for usage description of mode switch -MaskAnnot=S:
#     this does not work in the cyclic procedure of -SeqAnnot
#     and -SeqMask. See &SeqQueue for details.
#   I don't understand this BUG description myself anymore.
#
# - instead of testing ref($SeqFFmtOutFunc{$ProgOpt{-OutSeqFmt}}),
#   establish &SeqLab::SeqStreamOut::SupportsSeqFFmt
#
# - look also for notes in the header of each function block
#
################################################################################


# global constants and variables
# also used by modules as a unique global data anchor
our %GlobStore;

# include path(s), includes
use strict; #use warnings;  # OK 20130425
use Cwd qw(realpath);
# extend the module search path at compile time, i.e. before the "use"
# statements below are resolved
BEGIN {
  # prepend the directories listed in $ENV{KPERLPATH} (fallback:
  # $ENV{PERLPATH}), a colon-separated list; false list elements (e.g. empty
  # strings) are dropped
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
  # append the directory part of the resolved path of this script file
  my ($mypath) = realpath(__FILE__)=~m/(^.*)\//;
  push @INC, $mypath;
}
use FileHandle;
use File::Basename qw(&basename);
use MainLib::StrRegexp qw(&TimeStr);
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs &QueryConfirm);
use MainLib::File;
use MainLib::FileTmp qw(&PathUnique);
use MainLib::Misc;
use Math::kCalc;
use Math::Range;
use database::DbPlain;
use SeqLab::SeqBench;
use SeqLab::SeqFormat;
use SeqLab::SeqStreamIn;
use SeqLab::SeqStreamOut;
use SeqLab::Blast;


# script ID
# - $ProgFile    last path component of the script file path (__FILE__)
# - $ProgFstump  $ProgFile without a trailing suffix made of a dot and 1 to 4
#                word characters
our $ProgFile = ( split('/',__FILE__) )[-1];
(our $ProgFstump = $ProgFile) =~ s/\.\w{1,4}$//;

# global constants (esp. default values)
our %ProgParam;
@ProgParam{'MaskChar','MaskLen'} = (undef, 8);

# default values for program mode and switches
$ProgParam{default} = {
  db         => $BlastDefault{DbDictyAll},
  DbAnnot    => $BlastDefault{DbDictyMask},
  # sorting order in overlap filtering (OptionalSwitch -FilterOlap)
  FilterOlap => 'score,RelId,QueryLen',
  MaskPoly   => 12,
  OutSeqFmt  => 'fastA',
  ProgMode   => 'plain',
  ThreshId   => 0.915,
  ThreshLen  => 0,
  AnnotLabel => 'REPT',
  };

# working desk
$ProgParam{store} = undef;


# manage I/O #####################################

# organize I/O handles
# - &Unbuffer is not defined in the visible part of this file; presumably it
#   is imported from one of the MainLib modules (TODO confirm)
&Unbuffer();

# organize tmp files
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
# - the manager object is stored in $ProgParam{TmpManag}
my $pcFT = sub{ $ProgParam{TmpManag} = MainLib::FileTmp->new(-suffix=>''); };
&$pcFT;


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
# - the globals are reset before &GetoptsNArgs runs. Entries pushed to @ProgArg
#   during switch parsing end up behind the arguments returned by
#   &GetoptsNArgs, because the return value is prepended (unshift).
# - without an explicit ModeSwitch the default program mode is used
our $ProgMode = undef;
our @ProgArg = ();
our %ProgOpt = ();
unshift @ProgArg, &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};

# eventually open LOG file
# - $ProgOpt{-log} is either a log file path or the plain flag value 1 (cmp.
#   &AddSwitch). For the flag value, -file is passed as undef to &LogOpen.
if ($ProgOpt{-log}) {
  $ProgOpt{LogFile} = ($ProgOpt{-log} ne '1')? $ProgOpt{-log} : undef;
  &LogOpen (-file=>$ProgOpt{LogFile}, -stamp=>$ProgFstump, -prog=>"$ProgFile -$ProgMode");
}
# close the LOG file at program end, but only if switch -log was set
END {
  $ProgOpt{-log} and &LogClose();
}


# work flow manifold #############################

# program modes that do not need input argument(s):
# - usage help, also shown if the command line is completely empty
# - listing of BLAST parameter sets
if (!@ARGV or $ProgMode=~m/^h(elp)?$/i) {
  &usage();
}
elsif ($ProgMode =~ m/^ListParam$/i) {
  &ProgListParam();
  exit 0;
}

# all other program modes require input argument(s)
# - $arg is the file-scoped work variable of the program mode loops that
#   follow further down
my $arg;
@ProgArg or die "ERROR: input arguments missing\n";

# validate input argument(s)
# - "-" is always accepted, any other argument should expand to the path of a
#   non-empty file. A failing argument just causes a warning.
foreach my $PathIn (@ProgArg) {
  next if ($PathIn eq '-');
  next if (-s &PathExpand($PathIn));
  printf STDERR "WARNING: input file %s does not exist or has zero length\n", $PathIn||"''";
}

# initialize FileTmp manager
# - in debug mode, ask the manager to preserve temporary files. The debug
#   switch of the manager itself is set for debug levels above 1 only.
if (my $DebugLevel = $ProgOpt{-debug}) {
  $ProgParam{TmpManag}->AddSwitch (
    -preserve => 1,
    -debug    => ($DebugLevel > 1) ? 1 : undef,
    );
}

# initialize sequence queue
# - not done with switch -parse
unless ($ProgOpt{-parse}) {
  my $pSeqIn = $ProgParam{store}{queue}{SeqIn} = SeqLab::SeqStreamIn->new(@ProgArg);
  $pSeqIn or die "ERROR: unable to initialize sequence input queue\n";

  # hand over the selection switches; debug depth is decreased by one
  $pSeqIn->AddSwitch (
    -fast     => ($ProgMode !~ m/seq/i),
    -MatchID  => $ProgOpt{-MatchID},
    -SlcDescr => $ProgOpt{-SlcDescr},
    -SlcEnds  => $ProgOpt{-SlcEnds},
    -SlcID    => $ProgOpt{-SlcID},
    -SlcLen   => $ProgOpt{-SlcLen},
    -debug    => $ProgOpt{-debug} ? $ProgOpt{-debug}-1 : 0,
    );
}

# chain to program mode (with input argument(s))
# - mode names are matched case-insensitively, the first matching branch wins
# - most branches loop until &GetBlast returns a false value and pass each
#   returned value to the mode-specific function. Modes -html and -plain loop
#   over &SeqQueue instead.
# - modes -html, -plain, -SeqAnnot, -SeqHsp and -SeqMask call &ErrorParse if
#   switch -parse is set (presumably &ErrorParse reports the conflict and
#   terminates the program - TODO confirm)
# - the leading "if (0) { }" just keeps all real branches in uniform elsif form
if (0) { }
elsif ($ProgMode =~ m/^CrossHSP$/i) {
  # called once, without arguments
  &BlastCrossHsp();
}
elsif ($ProgMode =~ m/^data$/i) {
  while ($arg = &GetBlast()) {
    &BlastData ($arg);
  }
}
elsif ($ProgMode =~ m/^html$/i) {
  if ($ProgOpt{-parse}) { &ErrorParse() }
  while ($arg = &SeqQueue()) {
    &BlastHtml ($arg);
  }
}
elsif ($ProgMode =~ m/^List(HSP)?$/i) {
  # -List is a short form of -ListHSP
  while ($arg = &GetBlast()) {
    &BlastListHsp ($arg);
  }
}
elsif ($ProgMode =~ m/^ListID(sh)?$/i) {
  # both -ListID and -ListIDsh end up here. Optional constructor arguments
  # are passed only if the corresponding switch is set.
  $ProgParam{store}{queue}{SeqOut} = SeqLab::SeqStreamOut->new(
      $ProgOpt{-OutDir} ?
    (-dir    => $ProgOpt{-OutDir}) : (),
     -format => 'ID',
      $ProgOpt{-OutIdFmt} ?
    (-IdFormat => $ProgOpt{-OutIdFmt}) : (),
      $ProgOpt{-debug} ?
    (-debug  => $ProgOpt{-debug}-1) : (),
    );
  while ($arg = &GetBlast()) {
    &BlastListId ($arg);
  }
}
elsif ($ProgMode =~ m/^ListMatch$/i) {
  while ($arg = &GetBlast()) {
    &BlastListMatch ($arg);
  }
}
elsif ($ProgMode =~ m/^ListQuery$/i) {
  # output goes to the handle opened for switch -OutTab, default STDOUT. The
  # chosen handle is also stored back to $ProgOpt{OutTabHandle}.
  my $hOutTab = $ProgOpt{OutTabHandle} ||= \*STDOUT;
  # table header: program call, time stamp, query files, column labels
  print  $hOutTab "# $ProgFile -$ProgMode\n";
  printf $hOutTab "# date/time: %s\n", &TimeStr();
  printf $hOutTab "# query file%s: %s\n", (@ProgArg==1)?'':'s', join (', ', @ProgArg);
  printf $hOutTab "#\n# column labels:\n# %s\n", join ("\t",
    'QueryID',
    'CpuTime[s]',
    'NmbMatch',
    'HighestScore',
    'HighestId',
    'Matches',
    );
  while ($arg = &GetBlast()) {
    &BlastListQuery ($arg);
  }
}
elsif ($ProgMode =~ m/^ListScore(=(\w+))?$/i) {
  # normalize the mode name, store the optional mode argument in lower case
  # ($2 still refers to the match in the branch condition)
  $ProgMode = 'ListScore';
  $ProgOpt{-ListScore} = lc $2;
  while ($arg = &GetBlast()) {
    &BlastListScore ($arg);
  }
}
elsif ($ProgMode =~ m/^plain$/i) {
  if ($ProgOpt{-parse}) { &ErrorParse() }
  while ($arg = &SeqQueue()) {
    &BlastPlain ($arg);
  }
}
elsif ($ProgMode =~ m/^(SeqAnnot|tag)(=(\w+))?$/i) {
  # -tag is an alias of -SeqAnnot. Normalize the mode name, store the optional
  # mode argument ($3 still refers to the match in the branch condition).
  $ProgMode = 'SeqAnnot';
  $ProgOpt{-SeqAnnot} = $3;
  if ($ProgOpt{-parse}) { &ErrorParse() }
  unless ($ProgOpt{-db}) {
    # AddSwitch performs proof scanning in the context of all entered arguments
    &AddSwitch ('db='.$ProgParam{default}{DbAnnot});
  }
  # mode-specific defaults: output file 'rewrite', format 'Experiment'
  $ProgParam{store}{queue}{SeqOut} = SeqLab::SeqStreamOut->new(
      $ProgOpt{-OutDir} ?
    (-dir    => $ProgOpt{-OutDir}) : (),
     -file   => $ProgOpt{-OutSeq} || 'rewrite',
     -format => $ProgOpt{-OutSeqFmt} || 'Experiment',
      $ProgOpt{-OutIdFmt} ?
    (-IdFormat => $ProgOpt{-OutIdFmt}) : (),
      $ProgOpt{-debug} ?
    (-debug  => $ProgOpt{-debug}-1) : (),
    );
  while ($arg = &GetBlast()) {
    &BlastSeqAnnot ($arg);
  }
}
elsif ($ProgMode =~ m/^SeqHsp$/i) {
  $ProgOpt{-IdFormat} = 'acc2';
    # do not apply $ProgOpt{-OutIdFmt} through SeqLab::SeqStreamOut, because
    # the routine will independently add an HSP suffix to the shortest possible
    # identifier
  if ($ProgOpt{-parse}) { &ErrorParse() }
  $ProgParam{store}{queue}{SeqOut} = SeqLab::SeqStreamOut->new(
      $ProgOpt{-OutDir} ?
    (-dir    => $ProgOpt{-OutDir}) : (),
      $ProgOpt{-OutSeq} ?
    (-file   => $ProgOpt{-OutSeq}) : (),
      $ProgOpt{-OutSeqFmt} ?
    (-format => $ProgOpt{-OutSeqFmt}) : (),
     -IdFormat => undef,
        # identifier re-formatting is done independently in &BlastSeqHsp
      $ProgOpt{-debug} ?
    (-debug  => $ProgOpt{-debug}-1) : (),
    );
  while ($arg = &GetBlast()) {
    &BlastSeqHsp ($arg);
  }
}
elsif ($ProgMode =~ m/^(SeqMask|MaskSeq)$/i) {
  # -MaskSeq is an alias of -SeqMask
  $ProgMode = 'SeqMask';
  if ($ProgOpt{-parse}) { &ErrorParse() }
  unless ($ProgOpt{-db}) {
    # AddSwitch performs proof scanning in the context of all entered arguments
    &AddSwitch ('db='.$ProgParam{default}{DbAnnot});
  }
  $ProgParam{store}{queue}{SeqOut} = SeqLab::SeqStreamOut->new(
      $ProgOpt{-OutDir} ?
    (-dir    => $ProgOpt{-OutDir}) : (),
      $ProgOpt{-OutSeq} ?
    (-file   => $ProgOpt{-OutSeq}) : (),
      $ProgOpt{-OutSeqFmt} ?
    (-format => $ProgOpt{-OutSeqFmt}) : (),
      $ProgOpt{-OutIdFmt} ?
    (-IdFormat => $ProgOpt{-OutIdFmt}) : (),
      $ProgOpt{-debug} ?
    (-debug  => $ProgOpt{-debug}-1) : (),
    );
  while ($arg = &GetBlast()) {
    &BlastSeqMask ($arg);
  }
}
else {
  die "ERROR: unknown program mode or switch '$ProgMode'\n";
}

# exit script successfully
# cf. END blocks!
exit 0;


################################################################################
# usage help, command line arguments
################################################################################


# print usage help and exit
#
# INTERFACE
# - global data:
#   $ProgFile    program file name, interpolated into the help text
#   %ProgParam   default values, interpolated into the help text
#   $ENV{BLASTDB}  interpolated into the description of switch -db
#
# DESCRIPTION
# - the help text is printed to STDOUT, framed by one empty line each.
# - the function does not return. The program exits with exit code 1.
# - the description of switch -OutSeqFmt states case-insensitive matching, in
#   accordance with the switch parsing done in &AddSwitch.
#
sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile is used for running BLAST alignment searches with advanced
 report functions.

COMMAND LINE SYNTAX
 $ProgFile  -<ModeSwitch> [-<OptionalSwitch> ...] <Arg1> [<Arg2> ...]

Arguments
---------
 Typically, args specify sequence input files. Deviations are stated in the
 listing of ModeSwitches below.

path arguments:
 Path "-" resolves to STDIN or STDOUT, depending on the context.

Input File Format Support
-------------------------
 Input sequence files may have any of the following formats: Experiment file,
 fastA, plain sequence. Multi-sequence files are supported.
 BLAST report files may originate from any of these programs: WU-BLAST 2.0,
 blastall 2.1.

Output
------
 Basically, two different types of output may be generated by the program:

 sequence   a modified version of the source sequences, e.g. in program modes
            -SeqAnnot, -SeqMask. For sequence output, default output direction
            is STDOUT, but redirection is possible with switch -OutSeq, and
            file output may be influenced additionally by switch -OutDir.
 report     BLAST result in original (program mode -plain) or enriched
            text format (-html), or in tabular format (program modes -List*).
            Basically, BLAST reports, each corresponding to a single query
            sequence, are output separately to single files. The output file
            names are derived from the input file names plus sequence ID
            (Acc.No., if possible) plus a suffix that reflects the report
            format (see switch -OutSuffix for details). Construction of output
            filenames can be influenced by switches -OutDir, -OutStump,
            -OutSuffix.
            Alternatively, output may be bundled and redirected with switch
            -OutTab. The file format of tabular output can be chosen via
            switch -OutTabFmt, default: TAB-delimited.
            An exception to the each-report-to-single-file rule is program
            mode -ListQuery where the default output destination is STDOUT.

ModeSwitch (case-insensitive)
-----------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-crossHSP         construct graph that links query sequences to matching
                  sequences. The graph data structure is output in plain format.
                  A documentation of the HSP data structure can be found in
                  package SeqLab/Blast.pm, section "parsed BLAST report".
-data             output BLAST result as a data structure
-h(elp)           output command line syntax description and exit
-html             WWW-oriented output consisting of <InputName>.html and
                  <InputName>.png
                  -OutSuffix  no effect here
                  -OutTab     no effect here
-List(HSP)        list reported HSPs in table
                  -OutTabFmt  gff, table (default)
-ListID           list IDs of all matching sequences
-ListIDsh         like -ListID, but GenBank / GenPept entry numbers
                  (GIDs) displayed as bare numbers.
-ListMatch        list reported matching sequences in table
                  -OutTabFmt  gff, table (default)
-ListParam        list available BLAST parameter set definitions
                  args        NONE
-ListQuery        list BLAST results in one query per line
-ListScore(=S)    plot of maximum local HSP score
                  S           recalculate score: '' -> not (default), log10
-plain            normal BLAST producing plain text report.
                  -OutTab     concatenate reports and direct output to file
                  -OutTabFmt  no effect here
-SeqAnnot(=S)     turn hits (HSPs) into annotations
                  S           label of introduced annotations, default:
                              $ProgParam{default}{AnnotLabel}.
                              Existing annotations having this label will be
                              deleted.
                  -db         default: $ProgParam{default}{DbAnnot}
                  -OutSeq     default: 'rewrite'
                  -OutSeqFmt  default: 'Experiment'
                  -ValHspmax  only initializes hspmax overflow management
-SeqHSP           output matching sequence ranges from BLAST database. This
                  will only work with indexed WashU BLAST databases v2.0.
                  -HitSurrd=N add flanks of hit sequence ranges.
                  -OutIdFmt   identifier formatting is fixed to "acc2", here.
-SeqMask          mask query sequence according to the BLAST matches. Any
                  non-sequence-coding letters/characters in the sequence string
                  will be removed during this procedure.
                  -db         default: $ProgParam{default}{DbAnnot}
                  -ValHspmax  only initializes hspmax overflow management

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-db=S             use specified BLAST database (standard database path is
                  $ENV{BLASTDB}). There are database defaults for program
                  modes -SeqMask and -SeqAnnot.
-debug(=N)        print debug protocol to STDERR (sometimes STDOUT). Keep
                  temporary files.
                  N           debug depth value
-extern           adapt HTML output (ModeSwitch -html) to external access
                  WWW environment: hide file paths etc.
-FilterOlap(=S)   filter HSPs/matches overlapping on the query sequence
                  S           order of scoring criteria to find the best
                              local match, default: $ProgParam{default}{FilterOlap}
-FilterSelf       filter self-matches that will occur if the query sequence
                  is part of the BLAST database
-fofn=S           supply list of command arguments in a file. The entries of the
                  file will be appended to the argument list. Multiple -fofn
                  switch statements are allowed.
-log(=S)          redirect STDOUT and STDERR to LOG file
                  S           log file path, default path worked out
                              automatically from built-in directives.
-HitSurrd=N       flanks of hit ranges, size N, default: 0
-MaskAnnot=S      mask tagged ranges in the query sequence.
                  S           comma-separated list of annotation labels that
                              are meant to be masked
                  BUG: this does not work in the cyclic procedure of -SeqAnnot
                       and -SeqMask. See &SeqQueue documentation for details.
-MaskPoly(=N)     mask poly(X) stretches in the query sequence.
                  N           minimum length of poly(X) stretch, default: $ProgParam{default}{MaskPoly}
-MatchID=S        specify a file containing a list of identifiers that shall be
                  used for sequence entry selection.
-OutColAdd=S      add columns to table output. Note that the list of default
                  report columns is function specific. The option cannot affect
                  GFF format output.
                  S           comma-separated list of column labels that may be
                              one of: align => sequence alignments
-OutColHide=S     hide columns in table output. Note that the list of default
                  report columns is function specific. The option cannot affect
                  GFF format output.
                  S           comma-separated list of column labels
-OutDir=S         directory path for results output. This switch overrides any
                  directory statement provided with switch -OutStump.
-OutIdFmt=S       format of sequence identifiers in tabular output, case-
                  insensitive option argument:
                  ""          (default:) as in BLAST database entries and query
                              sequence files
                  acc         try to extract bare Acc.Nos. from complex identi-
                              fier fields
                  acc2        try to extract Acc.Nos. from complex identifier
                              fields, including database prefix
                  acc3        try to extract Acc.Nos. from complex identifier
                              fields, including database prefix and version
                              suffix
                  gi          try to extract gi numbers from complex identifier
                              fields
                  gidel       delete gi number component from complex identifier
                              fields
                  word        try to extract bare Acc.No. or first word from
                              complex identifier field. The danger is that
                              sequence identifiers turn non-unique.
-OutSeq=S         file path for sequence output
-OutSeq=rewrite   preserve the file structure as in sequence input.
                  Combination with other switches:
                  -OutDir     rewrite sequences to specified directory
-OutSeq=SingleSeq write single-sequence output with filenames made unique by
                  adding sequence ID to the source filename.
-OutSeqFmt=S      format for sequence output, case-insensitive:
                  Experiment  Staden Experiment file.
                  fastA       fastA or Pearson format (default)
                  GFF         GFF
                  plain       sequence output in condensed plain text format.
                              Line feed every 60 characters.
                  table       TAB-delimited table format
-OutStump=S       path stump for multi-file output. A default is derived from
                  input file names.
-OutSuffix=S      specify a file suffix for the BLAST output, default: '.bl'
                  for plain or '.bll' for list output. This switch has no
                  effect in program mode -html.
-OutSuffixCat     suppresses the removal of the file suffix of the input
                  file while constructing a name for an output file
-OutTab=S         redirect report output to single file
-OutTabFmt=S      format for table output, case-insensitive:
                  GFF         GFF format
                  table       TAB-delimited (default)
                  This switch will not affect the name suffix of output files.
                  Use -OutSuffix for that.
-param=S          string S specifies a parameter set from internal parameter
                  library. Cmp. ModeSwitch -ListParam.
-parse            arguments refer to BLAST report files. Do not invoke BLAST
                  processes.
-pid=S            output pid to file S
-program=S        use specified BLAST program.
-SlcDescr=S       specify an explicit RegExp or a list of sequence
                  description keywords (RegExps in a file) which shall be
                  used to select entries from the sequence source.
-SlcEnds=N        select sequence ends having the specified length N bp/aa. A
                  sequence smaller than two times this value will be loaded
                  in complete length. A switch argument value lower than 1
                  will be ignored.
-SlcID=S          specify a regexp that shall be used for sequence ID selection
-SlcKnown=N       select input sequences which have an at least N bp spanning
                  continuous non-masked sequence range. Masked ranges are
                  defined by poly(N) / poly(X) having a length of >=$ProgParam{MaskLen} bp.
                  However, the input sequences themselves are unchanged.
-SlcLen=N1(..N2)  select input sequences according to their length
                  N1          minimum length
                  N2          maximum length, default: no limit
-ThreshCplx=F     specify an alignment complexity threshold, default: NONE.
-ThreshId=F       specify a relative identity threshold, default for program
                  modes -SeqAnnot and -SeqMask: $ProgParam{default}{ThreshId}, for other modes: NONE.
-ThreshLen=N      specify an HSP length threshold, default for program modes
                  -SeqAnnot and -SeqMask: $ProgParam{default}{ThreshLen}, for other modes: NONE.
-Val...           set BLAST-specific parameters. First-hand description at
                  http://blast.wustl.edu/. Note: The intention of this program
                  is not to relieve you of the work to think about BLAST
                  parameters.
-ValB=N           explicitly set BLAST parameter B, cf. BLAST documentation.
                  BLAST parameter V is set equal to parameter B.
-ValCpus=N        explicitly set maximum number of CPUs used, cf. BLAST
                  documentation
-ValE=F           explicitly set BLAST parameter E to F, which is the expect
                  threshold with 0 < F < 1.
-ValGapW=N        explicitly set BLAST parameter gapW, cf. BLAST documentation
-ValGapX=N        explicitly set BLAST parameter gapX, cf. BLAST documentation
-ValHspmax=N      explicitly set BLAST parameter hspmax, cf. BLAST
                  documentation
-ValM=N           explicitly set BLAST parameter M, cf. BLAST documentation
-ValMatrix=S      explicitly set BLAST parameter matrix, cf. BLAST
                  documentation
-ValN=-N          explicitly set BLAST parameter N, cf. BLAST documentation
-ValNoGap         turn off gapped alignments
-ValQ=N           explicitly set BLAST parameter Q, cf. BLAST documentation
-ValR=N           explicitly set BLAST parameter R, cf. BLAST documentation
-ValS=N           explicitly set BLAST parameter S, cf. BLAST documentation
-ValS2=N          explicitly set BLAST parameter S2, cf. BLAST documentation
-ValT=N           explicitly set BLAST parameter T, cf. BLAST documentation
                  what is the default value?
-ValV=N           alias to -ValB
-ValW=N           explicitly set BLAST parameter W, cf. BLAST documentation
-ValX=F           explicitly set BLAST parameter X, cf. BLAST documentation
                  what is the default value?

Environment Variables
---------------------
 \$KPERLPATH       primary search path for Perl package look-up
 \$TEMPPATH        directory for storage of temporary files, default /tmp

Temporary Files
---------------
 The program may generate temporary files. These will be placed either in a
 directory specified by \$ENV{TEMPPATH} or in /tmp.
END_USAGE
  print "\n";
  exit 1;
}


# add program switches to global table
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global options:
#   -debug       [STD]
#
# - global data:
#   $ProgMode
#   @ProgArg     typically stores the return value of &GetoptsNArgs, but may
#                be pre-filled here with the entries of a file of filenames
#                (option -fofn)
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &GetoptsNArgs (imported from
#   MainLib::Cmdline)
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can be reported with reference
#   to the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my $copt = shift;
  my $debug = $ProgOpt{-debug};

  # optional switches
  my ($SwitchArg);
  if ($copt =~ m/^db=(.+)$/i) {
    $ProgOpt{-db} = $1;
    if ($ProgOpt{-db} =~ m|/|) {
      $ProgOpt{-db} = &PathExpand ($ProgOpt{-db});
    } else {
      $ProgOpt{-db} = $ENV{BLASTDB} .'/'. $ProgOpt{-db};
    }
    # validity of this arg is tested in &SeqLab::Blast::BlastParamStr
    return;
  }
  if ($copt =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($copt =~ m/^extern$/i) {
    $ProgOpt{-extern} = 1;
    return;
  }
  if ($copt =~ m/^FilterOlap(=(\S+))?$/i) {
    $ProgOpt{-FilterOlap}
      = [ split (/,/, $2 || $ProgParam{default}{FilterOlap}) ];
    return;
  }
  if ($copt =~ m/^FilterSelf(=1)?$/i) {
    $ProgOpt{-FilterSelf} = 1;
    return;
  }
  if ($copt =~ m/^fofn=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    if (my $pTable=&LoadFoid($SwitchArg)) {
      push @ProgArg, @$pTable;
    } else {
      die sprintf "ERROR: unable to read entries from file of filenames %s (-> %s)\n",
        $1, $SwitchArg;
    }
    return;
  }
  if ($copt =~ m/^log(=(.*))?$/i) {
    $ProgOpt{-log} = $2 ? &PathExpand($2) : 1;
    return;
  }
  if ($copt =~ m/^HitSurrd=(.+)$/i) {
    $ProgOpt{-HitSurrd} = int ($1);
    return;
  }
  if ($copt =~ m/^parse$/i) {
    $ProgOpt{-parse} = 1;
    return;
  }
  if ($copt =~ m/^Mask(Annot|Tag)=(.+)$/i) {
    $ProgOpt{-MaskAnnot} = [ split (/,/, $2) ];
    return;
  }
  if ($copt =~ m/^MaskPoly(=(\d+))?$/i) {
    $ProgOpt{-MaskPoly} = $2 || $ProgParam{default}{MaskPoly};
    return;
  }
  if ($copt =~ m/^MatchID=(.+)$/i) {
    $SwitchArg = (
      grep { ($_ eq '-') or (-r $_ and ! -d $_ and -s $_) }
      &PathExpand($1), $1 )[0];
    if ($SwitchArg and my $pSlc=&LoadFoid($SwitchArg)) {
      $ProgOpt{-debug} and printf STDERR "read %d entr%s from file of ID selectors: %s\n",
        int(@$pSlc), (@$pSlc==1)?'y':'ies', $SwitchArg||"''";
      $ProgOpt{-MatchID} = $pSlc;
    } else {
      die "ERROR: unable to read file $1 (option -MatchID)\n";
    }
    return;
  }
  if ($copt =~ m/^OutColAdd=(.+)$/i) {
    push @{$ProgOpt{-OutColAdd}}, split(/,/,$1);
    return;
  }
  if ($copt =~ m/^OutColHide=(.+)$/i) {
    push @{$ProgOpt{-OutColHide}}, split(/,/,$1);
    return;
  }
  if ($copt =~ m/^OutDir=(.+)$/i) {
    $ProgOpt{-OutDir} = &PathExpand ($1);
    unless (-d $ProgOpt{-OutDir}) {
      if (-e $ProgOpt{-OutDir}) {
        die sprintf "ERROR: output destination exists, but is not a directory: %s\n", $ProgOpt{-OutDir}||"''";
      }

      # this dialogue will also appear if there's no output do be done to any
      # file/directory
      else {
        if (int(grep{ $_ eq '-' }@ARGV)) {
          printf STDERR "creating non-existent output directory (skip dialogue in STDIN input mode)\n";
          mkdir ($ProgOpt{-OutDir});
        } else {
          printf STDERR "output directory does not exist, create?";
          if (&QueryConfirm()) {
            mkdir ($ProgOpt{-OutDir});
          } else { exit 1 }
        }
      }
    }
    return;
  }
  if ($copt =~ m/^OutIdFmt=([\w-]+)$/i) {
    $ProgOpt{-OutIdFmt} = ( grep { lc($1) eq lc($_) }
      grep { ref($SeqidFunc{$_}) eq 'CODE' } keys(%SeqidFunc) )[0];
    unless ($ProgOpt{-OutIdFmt}) {
      die sprintf "ERROR: specified identifier format action %s (opton -OutIdFmt) is not available\n", $1;
    }
    return;
  }
  if ($copt =~ m/^OutSeq=(.+)$/i) {
    $ProgOpt{-OutSeq} =
      ($1 eq '-' or int(grep {$_ eq $1} @{$SeqLab::SeqStreamOut::LibGlob{FileMagic}})) ?
      $1 : &PathExpand($1);
    return;
  }
  if ($copt =~ m/^OutSeqFmt=(\w+)$/i) {
    $ProgOpt{-OutSeqFmt} = ( grep { lc($1) eq lc($_) }
      grep { ref($SeqFFmtOutFunc{$_}) eq 'CODE' } keys(%SeqFFmtOutFunc) )[0];
    unless ($ProgOpt{-OutSeqFmt}) {
      die sprintf "ERROR: specified sequence output format %s (opton -OutSeqFmt) is not available\n", $1;
    }
    $debug and printf STDERR "%s. set seq output format to %s\n", &MySub, $ProgOpt{-OutSeqFmt};
    return;
  }
  if ($copt =~ m/^OutSt[au]mp=(.+)$/i) {
    $ProgOpt{-OutStump} = $1;
    return;
  }
  if ($copt =~ m/^(Out)?Suffix=(.*)$/i) {
    $ProgOpt{-OutSuffix} = $2;
    # ensure '.' at the beginning
    if ($ProgOpt{-OutSuffix} !~ m/^\./) {
      $ProgOpt{-OutSuffix} = '.'. $ProgOpt{-OutSuffix};
    }
    return;
  }
  if ($copt =~ m/^OutSuffixCat$/i) {
    $ProgOpt{-OutSuffixCat} = 1;
    return;
  }
  if ($copt =~ m/^OutTab=(.+)$/i) {
    $ProgOpt{-OutTab} = ($1 eq '-') ? $1 : &PathExpand($1);
    $ProgOpt{OutTabHandle} = ($1 eq '-') ? \*STDOUT :
      FileHandle->new($ProgOpt{-OutTab},'w');
    unless ($ProgOpt{OutTabHandle}) {
      die sprintf "ERROR: unable to write output file %s\n", $ProgOpt{-OutTab}||"''";
    }
    return;
  }
  if ($copt =~ m/^OutTabFmt=(\w+)$/i) {
    $ProgOpt{-OutTabFmt} = lc $1;
    if ($ProgOpt{-OutTabFmt} !~ m/^(gff|html|tab(le)?)$/) {
      die sprintf "ERROR: specified table output format %s is not available\n", $ProgOpt{-OutTabFmt}||"''";
    }
    return;
  }
  if ($copt =~ m/^param=(.+)$/i) {
    unless (defined $SeqLab::Blast::LibGlob{ParamSet}{nucleotide}{$1} or
            defined $SeqLab::Blast::LibGlob{ParamSet}{protein}{$1}
      ) {
      printf STDERR "ERROR: BLAST parameter set %s is not defined\n", $1;
      print  STDERR "specify any parameter set from internal library:\n";
      foreach ('nucleotide', 'protein') {
      printf STDERR "  %s comparison: %s\n", $_, join (', ', sort keys %{$SeqLab::Blast::LibGlob{ParamSet}{$_}});
      }
      exit 1;
    }
    $ProgOpt{-param} = $1;
    return;
  }
  if ($copt =~ m/^pid=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    &WriteFile ($SwitchArg, "$$\n");
    return;
  }
  if ($copt =~ m/^program=([\w-]+)$/i) {
    $ProgOpt{-program} = $1;
    if ($ProgOpt{-program} !~ m/blast/i) {
      die sprintf "ERROR: specified BLAST program %s not available\n",
        $ProgOpt{-program} ? $ProgOpt{-program} : "''";
    }
    return;
  }
  if ($copt =~ m/^Select/i) {
    die "ERROR: the -Select* switches are now spelled -Slc*\n";
  }
  if ($copt =~ m/^SlcDescr=(.+)$/i) {
    $ProgOpt{-SlcDescr} = $1;
    return;
  }
  if ($copt =~ m/^SlcEnds=(-?\d+)$/i) {
    $ProgOpt{-SlcEnds} = int ($1);
    if ($ProgOpt{-SlcEnds} <= 0) {
      printf STDERR "WARNING: use option syntax -SlcEnds=N with N > 0. N=%d does not make sense!\n", $1;
      delete $ProgOpt{-SlcEnds};
    }
    return;
  }
  if ($copt =~ m/^SlcID=(.+)$/i) {
    $ProgOpt{-SlcID} = $1;
    return;
  }
  if ($copt =~ m/^SlcKnown=(\d+)$/i) {
    $ProgOpt{-SlcKnown} = $1;
    $ProgOpt{-SlcLen} ||= Math::Range->new($1,undef);
    return;
  }
  if ($copt =~ m/^SlcLen=(.+)$/i) {
    unless ($ProgOpt{-SlcLen} = Math::Range->new_parsed($1)) {
      die "ERROR: invalid argument for switch -SlcLen: $1\n";
    }
    return;
  }
  if ($copt =~ m/^ThreshCplx=([\d\.]+)$/i) {
    $ProgOpt{-ThreshCplx} = $1;
    if ($ProgOpt{-ThreshCplx} < 0) {
      die "ERROR: specify complexity threshold in the range: ThreshCplx > 0\n";
    }
    return;
  }
  if ($copt =~ m/^ThreshId=([\d\.]+)$/i) {
    $ProgOpt{-ThreshId} = $1;
    if ($ProgOpt{-ThreshId} < 0 or $ProgOpt{-ThreshId} > 1) {
      die "ERROR: specify identity threshold in the range: 0 < ThreshId < 1\n";
    }
    return;
  }
  if ($copt =~ m/^ThreshLen=(\d+)$/i) {
    $ProgOpt{-ThreshLen} = $1;
    return;
  }
  if ($copt =~ m/^ValB=(.+)$/i) {
    $ProgOpt{-ValB} = int ($1);
    if ($ProgOpt{-ValB} < 0) {
      die "ERROR: specify BLAST parameter in the range: B >= 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValV=(.+)$/i) {
    $ProgOpt{-ValV} = int ($1);
    if ($ProgOpt{-ValV} < 0) {
      die "ERROR: specify BLAST parameter in the range: V >= 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValCpus=(.+)$/i) {
    $ProgOpt{-Valcpus} = int ($1);
    if ($ProgOpt{-Valcpus} < 1) {
      die "ERROR: specify BLAST parameter cpus in the range: cpus > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValE=([e0-9\.+-]+)$/i) {
    $ProgOpt{-ValE} = $1;
    $ProgOpt{-ValE} =~ s/^(e.*)$/1$1/i;
    return;
  }
  if ($copt =~ m/^ValGapW=(.+)$/i) {
    $ProgOpt{-ValgapW} = int ($1);
    if ($ProgOpt{-ValgapW} < 1) {
      die "ERROR: specify BLAST parameter gapW in the range: gapW > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValGapX=(.+)$/i) {
    $ProgOpt{-ValgapX} = int ($1);
    if ($ProgOpt{-ValgapX} < 1) {
      die "ERROR: specify BLAST parameter gapX in the range: gapX > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValHspmax=(.+)$/i) {
    $ProgOpt{-Valhspmax} = int ($1);
    if ($ProgOpt{-Valhspmax} < 1) {
      die "ERROR: specify BLAST parameter hspmax in the range: hspmax > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValM=(.+)$/i) {
    $ProgOpt{-ValM} = int ($1);
    if ($ProgOpt{-ValM} < 1) {
      die "ERROR: specify BLAST parameter M in the range: M > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValMatrix=(.+)$/i) {
    $ProgOpt{-Valmatrix} = $1;
    if (! grep { -r $_ } ($ProgOpt{-Valmatrix} =~ m/\//) ?
       $ProgOpt{-Valmatrix} : glob("$ENV{BLASTMAT}/*/$ProgOpt{-Valmatrix}")
    ) {
      die sprintf "ERROR: unable to read scoring matrix file %s\n", $ProgOpt{-Valmatrix};
    }
    return;
  }
  if ($copt =~ m/^ValN=(.+)$/i) {
    $ProgOpt{-ValN} = int ($1);
    if ($ProgOpt{-ValN} > -1) {
      die "ERROR: specify BLAST parameter N in the range: N <= -1 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValNoGap$/i) {
    $ProgOpt{-Valnogap} = 1;
    return;
  }
  if ($copt =~ m/^ValQ=(.+)$/i) {
    $ProgOpt{-ValQ} = int ($1);
    if ($ProgOpt{-ValQ} < 1) {
      die "ERROR: specify BLAST parameter Q in the range: Q > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValR=(.+)$/i) {
    $ProgOpt{-ValR} = int ($1);
    if ($ProgOpt{-ValR} < 1) {
      die "ERROR: specify BLAST parameter R in the range: R > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValS=(.+)$/i) {
    $ProgOpt{-ValS} = int ($1);
    if ($ProgOpt{-ValS} < 1) {
      die "ERROR: specify BLAST parameter S in the range: S > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValS2=(.+)$/i) {
    $ProgOpt{-ValS2} = int ($1);
    if ($ProgOpt{-ValS2} < 1) {
      die "ERROR: specify BLAST parameter S in the range: S > 0 (is $1)\n";
    }
    if ($ProgOpt{-ValS} and $ProgOpt{-ValS}<$ProgOpt{-ValS2}) {
      die "ERROR: BLAST parameter S2 needs to fulfil: S2 < S (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValT=(.+)$/i) {
    $ProgOpt{-ValT} = int ($1);
    if ($ProgOpt{-ValT} < 1) {
      die "ERROR: specify BLAST parameter T in the range: T > 0 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValW=(.+)$/i) {
    $ProgOpt{-ValW} = int ($1);
    if ($ProgOpt{-ValW} < 1 or $ProgOpt{-ValW} > 50) {
      die "ERROR: specify BLAST parameter W in the range: 0 < W < 50 (is $1)\n";
    }
    return;
  }
  if ($copt =~ m/^ValX=([\d\.]+)$/i) {
    $ProgOpt{-ValX} = $1;
    if ($ProgOpt{-ValX} < 1) {
      die "ERROR: specify BLAST parameter X in the range: X > 0 (is $1)\n";
    }
    return;
  }

  # program mode switches
  if (defined($ProgMode)) {
    die sprintf "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($copt||"''");
  }
  else {
    $ProgMode = $copt;
    return;
  }
}


# error exit: option -parse conflicts with program mode
#
# INTERFACE
# - global data:
#   $ProgMode   selected program mode; reported as "plain" if no mode
#               switch has been given
#
# DESCRIPTION
# - the function never returns, it always terminates via die().
#
sub ErrorParse {
  die sprintf "ERROR: option -parse conflicts with program mode \"%s\"\n",
    $ProgMode||'plain';
}


################################################################################
# basic I/O
################################################################################


# hand over current input file handle (BLAST report files)
#
# INTERFACE
# - global options:
#   -debug      [STD]
#
# - return val: - reference to file handle
#               - undef if queue is empty
#
# - global data:
#   @ProgArg                             remaining report file paths, consumed
#                                        from the front
#   $ProgParam{store}{queue}{ReportPath} path of the file most recently taken
#                                        from the queue
#   $ProgParam{store}{queue}{ReportHdl}  handle of that file
#
# DESCRIPTION
# - The current handle is returned as long as it is not at end-of-file.
#   Otherwise, the function turns to the next path in @ProgArg.
# - Paths that cannot be opened are reported by a warning and skipped.
# - Empty files are skipped silently.
# - An undefined or empty path argument ends the queue.
#
sub HdlQueue {
  my $debug = $ProgOpt{-debug};

  # try current input file handle
  my $pHdl = $ProgParam{store}{queue}{ReportHdl};

  # turn to next file, until we have a handle that offers data
  while (! $pHdl or $pHdl->eof()) {
    my $fn = shift @ProgArg;
    # test for defined/length rather than truth: '0' is a valid file name
    unless (defined($fn) and length($fn)) { return undef }
    $debug and printf STDERR "%s. turning to input file %s\n", &MySub, $fn;
    $ProgParam{store}{queue}{ReportPath} = $fn;
    $pHdl = $ProgParam{store}{queue}{ReportHdl} = FileHandle->new($fn);
    # do not drop unreadable files without notice
    $pHdl or warn sprintf "WARNING: unable to read input file %s: %s\n", $fn, $!;
  }

  return $pHdl;
}


# hand over next sequence data structure from sequence input queue
#
# INTERFACE
# - global options:
#   -debug      [STD]
#   -MaskAnnot  reference to array of annotation labels. Ranges annotated
#               with one of these labels are masked in the query sequence.
#   -MaskPoly   mask poly(X) stretches. A value >1 is handed over to
#               &MaskPoly as -ThreshPoly.
#   -SlcKnown   minimum length of a sequence fragment free of masked
#               stretches. Sequences failing this filter are skipped.
#
# - global data:
#   $ProgParam{store}{queue}{SeqIn}  sequence input queue object
#   $ProgParam{MaskChar}  derived from current seq type
#   $ProgParam{MaskLen}   minimum run of masking characters that splits the
#                         sequence into fragments for the -SlcKnown filter
#   $ProgParam{TmpManag}  manager object for temporary files
#   $ProgOpt{-QueryType}  is set to the type of the current sequence
#
# - return val: - reference to sequence array data structure
#               - undef if queue is empty or an error occurs
#
# DESCRIPTION
# - MAIN or the $ProgMode function has previously initialized class SeqStreamIn
#   with an array of path names.
# - all returned sequences are additionally saved to temporary single-sequence
#   files in fastA format, pathname stored in field 'SrcTmp', managed by
#   a MainLib::FileTmpGrp object that is anchored in field 'SrcTmpGrp'.
# - Always, a gap-free sequence string will be stored in a field labelled
#   'SeqPure'.
# - In case of $ProgOpt{-MaskPoly} set true, the masked sequence can be found
#   in field 'SeqPure' and is saved to the temporary file.
# - Sequences rejected by the -SlcKnown filter are skipped in a loop (not by
#   recursion), so the call depth does not depend on the number of rejected
#   sequences.
#
sub SeqQueue {
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # loop until a sequence passes the selection or the queue is exhausted
  while (1) {

    # get next sequence entry from sequence source queue
    my $pSeq = $ProgParam{store}{queue}{SeqIn}->GetNext(-debug=>$dbg2);
    $pSeq or return undef;
    $debug and printf STDERR "%s. preparing sequence %s\n", &MySub, $$pSeq{id}||"''";

    # determine sequence type
    # this is needed for determination/verification of the appropriate BLAST program/db
    my $SeqType = &SeqType ($$pSeq{sequence});
    $SeqType =~ s/^[DR]NA$/nucleotide/;
    $ProgOpt{-QueryType} = $SeqType;
    $debug and printf STDERR "%s. query sequence %s, type %s\n", &MySub,
      $$pSeq{id}||"''", $SeqType||"''";

    ############################################################################
    # mask, purify sequence string
    my $SeqStrMasked;

    # masking character for current sequence
    # $ProgMode may be undefined (plain report mode), so guard the match
    if ($ProgOpt{-MaskPoly} or $ProgOpt{-MaskAnnot} or
        ($ProgMode||'') =~ m/^(Seq(Annot|Mask)|MaskSeq|tag)\b/ or
        $ProgOpt{-SlcKnown}) {
      $ProgParam{MaskChar} = $SeqSmbUnk{$SeqType};
      if ($ProgParam{MaskChar}) {
        $debug and printf STDERR "%s. query sequence %s, type %s, masking character %s\n", &MySub,
          $$pSeq{id}||"''", $SeqType||"''", $ProgParam{MaskChar}||"''";
      } else {
        die sprintf "%s. ERROR: sequence %s: unable to determine masking character for seq type %s,\n", &MySub,
          $$pSeq{id}||"''", $SeqType||"''";
      }
    }

    # mask annotated ranges in sequence
    # - this applies to the non-purified sequence string
    # - we need to know MaskChar
    if ($ProgOpt{-MaskAnnot}) {
      my $RegexpAnnot = join ('|', @{$ProgOpt{-MaskAnnot}});
      my @AnnotMask = grep { $_->{type}=~m/^($RegexpAnnot)$/o } @{$$pSeq{annot}};
      if (@AnnotMask) {
        $SeqStrMasked = $$pSeq{sequence};
        foreach my $pAnnot (@AnnotMask) {
          $debug and printf STDERR "%s. masking annotation range %d..%d\n", &MySub, $$pAnnot{offset}, $$pAnnot{end};
          # annotation positions are used as 1-based, inclusive coordinates
          substr ($SeqStrMasked, $$pAnnot{offset}-1, $$pAnnot{end}-$$pAnnot{offset}+1)
            =~ s/[$SeqSmb{all}]/$ProgParam{MaskChar}/gi;
        }
        $debug and printf STDERR "%s. query sequence masked according to annotations labelled: %s:\n%s", &MySub,
          join ('|', @{$ProgOpt{-MaskAnnot}}) || "''",
          &SeqentryToFasta ({id=>$$pSeq{id}, sequence=>$SeqStrMasked});
      }
    }

    # need pure copy of sequence
    # don't use option -pure in SeqStreamIn, purification is done only for
    #   masking and query output. The original sequence is assumed to be
    #   untouched as it may be used for later output.
    # an existing field 'SeqPure' takes precedence over the masked string
    $debug and printf STDERR "%s. query sequence %s, enforcing pure seq string\n", &MySub, $$pSeq{id}||"''";
    $$pSeq{SeqPure} ||= &SeqStrPure ($SeqStrMasked||$$pSeq{sequence}, -upper=>1);

    # mask poly(X) stretches in pure sequence
    # - this applies to the purified sequence string
    # - we need to know MaskChar
    if ($ProgOpt{-MaskPoly}) {
      $debug and printf STDERR "%s. query sequence %s, masking poly(X) stretches, min. length %d\n", &MySub,
        $$pSeq{id}||"''", $ProgOpt{-MaskPoly};
      $$pSeq{SeqPure} = &MaskPoly ($$pSeq{SeqPure},
        -ThreshPoly => ($ProgOpt{-MaskPoly}>1) ? $ProgOpt{-MaskPoly} : undef,
        -SmbMask    => $ProgParam{MaskChar},
        -debug      => $dbg2);
    }

    ############################################################################
    # select sequence range, write sequence

    # select by length of known sequence
    # the sequence is split at runs of at least MaskLen masking characters,
    #   the longest remaining fragment has to reach -SlcKnown
    if ($ProgOpt{-SlcKnown}) {
      my $RegexpSplit = sprintf '%s{%d,}', $ProgParam{MaskChar}, $ProgParam{MaskLen};
      my $FragLen = &Max (map{ length($_) } split(/$RegexpSplit/,$$pSeq{SeqPure}));
      if ($FragLen < $ProgOpt{-SlcKnown}) {
        $debug and printf STDERR "%s. query sequence %s fails length filter %d (split %d x %s), fragment maximum = %d\n", &MySub,
          $$pSeq{id}||"''", $ProgOpt{-SlcKnown},
          $ProgParam{MaskLen}, $ProgParam{MaskChar},
          $FragLen;
        # skip this entry, turn to next sequence in queue
        next;
      }
    }

    # save sequence to single-sequence fastA file
    $$pSeq{SrcTmpGrp} = $ProgParam{TmpManag}->CreateGrp();
    $$pSeq{SrcTmp} = $$pSeq{SrcTmpGrp}->Create('.fa');
    &WriteFile ($$pSeq{SrcTmp}, &SeqentryToFasta($pSeq,-KeySeq=>'SeqPure',-pure=>1));
    $debug and printf STDERR "%s. saved fastA entry %s as separate query file %s\n", &MySub,
      $$pSeq{id}||"''", $$pSeq{SrcTmp}||"''";

    # return sequence data structure
    return $pSeq;
  }
}


# derive an output file path from source file and identifier information
#
# INTERFACE
# - argument 1: reference path
#               typically sequence source file or report file
# - argument 2: reference substring, typically sequence identifier.
#               If true, it is appended to the name stem (restricted to
#               characters a-z A-Z 0-9 _ . -).
#
# - options:
#   -debug      [STD]
#   -overwrite  re-use the resulting path even if the file already exists
#               (the file is truncated). Default: look for a unique
#               filename.
#   -OutSuffix  filename suffix, default: '.bl'. This option is overridden
#               by $ProgOpt{-OutSuffix}.
#
# - global options:
#   -OutDir     [STD] replaces the directory of the reference path
#   -OutStump   [STD] replaces directory and name of the reference path
#   -OutSuffix  [STD]
#   -OutSuffixCat  [STD] keep the suffix of the name stem when appending the
#               identifier substring
#
# - return val: - output path (file created with null-length and checked)
#               - undef if an error occurs
#
# - global data:
#   %{$ProgParam{store}{OutSub}}  look-up index to allow unique suffices
#                                 not implemented 20040227
#
sub PrepOutpath {
  my ($RefPath, $RefSub, %opt) = @_;
  my $debug = $opt{-debug};
  my $SuffixDflt = defined($opt{-OutSuffix}) ? $opt{-OutSuffix} : '.bl';

  # step 1: name stem, either as specified or derived from reference path
  my $PathOut;
  if ($ProgOpt{-OutStump}) {
    $PathOut = $ProgOpt{-OutStump};
  } else {
    my $pField = &PathSplit ($RefPath);
    $debug and printf STDERR "%s. source path: %s\n", &MySub, $RefPath||"''";
    $ProgOpt{-OutDir} and $$pField{dir} = $ProgOpt{-OutDir};
    $PathOut = join ('/', $$pField{dir}, $$pField{name});
  }
  $debug and print  STDERR "output path, step A1: $PathOut\n";

  # step 2: identifier substring for multi-sequence source,
  #   otherwise just strip a sequence file suffix
  if (! $RefSub) {
    $PathOut =~ s/\.(fa|pln)$//;
  } else {
    $ProgOpt{-OutSuffixCat} or $PathOut = &PathChgSuffix ($PathOut, '', -last=>1);
    # delete all characters which are not filename-safe
    (my $IdClean = $RefSub) =~ tr/a-zA-Z0-9_.-//cd;
    $PathOut .= "_$IdClean";
  }
  $debug and print  STDERR "output path, step A2: $PathOut\n";

  # step 3: append file suffix, expand to nice, rooted path
  $PathOut = &PathExpand ($PathOut . ($ProgOpt{-OutSuffix} || $SuffixDflt));

  # step 4: provide an empty file at the path
  if (! $opt{-overwrite}) {
    $PathOut = &PathUnique (-name=>$PathOut, -touch=>1);
  } elsif (-e $PathOut) {
    truncate($PathOut,0);
  } else {
    &touch($PathOut);
  }
  $debug and print  STDERR "output path, step 3: $PathOut\n";

  # success: file exists and is empty
  if (-e $PathOut and -s($PathOut)==0) {
    return $PathOut;
  }

  # failure
  printf STDERR "%s. ERROR: unable to write output file %s\n", &MySub,
    $PathOut||"''";
  return undef;
}


# prepare table output header
#
# INTERFACE
# - argument 1: reference to BLAST result data structure. Fields read:
#               QuerySeq (id, length), QueryProgAndDb, QueryParamStr,
#               MatchNum
#
# - global options:
#   ...         all set options named -Filter* / -Thresh* are listed in the
#               header. For selection and filter options see &GetMatches /
#               &GetHsps
#
# - global data:
#   $ProgFile, $ProgMode
#
# - return val: header string, each line starting with comment character '#'
#
# DESCRIPTION
# - query parameters have been added to the result data structure
#   secondarily in &GetMatches / &GetHsps.
# - The filter options are listed in sorted order, so that the header is
#   reproducible between program runs. Options that hold an array reference
#   (like -FilterOlap) are listed as comma-separated values.
#
sub PrepTabHeader {
  my ($pBlParse) = @_;

  # used output filtering
  my @FltItem;
  foreach my $OptKey (sort grep { m/^-(Filter|Thresh)/ and $ProgOpt{$_} } keys %ProgOpt) {
    my $OptVal = $ProgOpt{$OptKey};
    # expand list-type options, do not print the stringified reference
    if (ref($OptVal) eq 'ARRAY') { $OptVal = join (',', @$OptVal) }
    push @FltItem, "$OptKey=$OptVal";
  }
  my $RsltFlt = join (' ', @FltItem) || 'NONE';

  # construct header in comment
  my $TabHeader = '';
  $TabHeader .= "# $ProgFile -$ProgMode\n";
  $TabHeader .= sprintf "# date/time: %s\n", &TimeStr();
  $TabHeader .= sprintf "# query seq: %s, length %d\n",
    $$pBlParse{QuerySeq}{id}||"''", $$pBlParse{QuerySeq}{length};
  $TabHeader .= "# BLAST program, database: $$pBlParse{QueryProgAndDb}\n";
  $TabHeader .= "# BLAST parameters: $$pBlParse{QueryParamStr}\n";
  $TabHeader .= "# output filtering: $RsltFlt\n";
  $TabHeader .= sprintf "# total no. of matches in primary BLAST report: %d\n",
    $$pBlParse{MatchNum};

  # exit SUB
  return $TabHeader;
}


# print listing of the available BLAST parameter set definitions
#
# INTERFACE
# - global data:
#   %SeqLab::Blast::LibGlob  parameter sets are read from
#               {ParamSet}{nucleotide} and {ParamSet}{protein}
#
# DESCRIPTION
# - output goes to the currently selected output handle.
# - for each sequence type there is an underlined headline, followed by the
#   parameter sets in sorted order, each with its sorted parameter/value
#   pairs.
#
sub ProgListParam {
  foreach my $SeqType ('nucleotide', 'protein') {

    # headline for sequence type category
    printf "\n%s\n%s\n\n", uc($SeqType), '=' x length($SeqType);

    # parameter sets of this category
    my $pSetIdx = ($SeqLab::Blast::LibGlob{ParamSet}{$SeqType} ||= {});
    foreach my $ParamKey (sort keys %$pSetIdx) {
      my $Listing = "$ParamKey:\n";
      foreach my $ParamName (sort keys %{$$pSetIdx{$ParamKey}}) {
        $Listing .= sprintf (" %s  %s\n", $ParamName, $$pSetIdx{$ParamKey}{$ParamName});
      }
      print  $Listing, "\n";
    }
  }
}


################################################################################
# classical report
################################################################################


# do BLAST, produce text output
#
# INTERFACE
# - argument 1: reference to sequence data structure (query seq). Fields
#               read: SrcTmp, SrcPath, SrcMulti, id
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutTab     [STD] if set, the report is written to the handle
#               $ProgOpt{OutTabHandle} and terminated by a line "//".
#               Otherwise, an output file is prepared via &PrepOutpath.
#
# DESCRIPTION
# - At this stage of development nothing is parsed from the BLAST report.
#   The report's just checked for errors and repeated in some cases.
# - The BLAST search is repeated if the report contains a line "FATAL:"
#   referring to ExpandX or WordFinderSearch.
# - error messages go to STDERR.
#
sub BlastPlain {
  my ($pSeq) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # create BLAST call
  my ($sProgAndDb,$sParam) = &BlastParamStr (%ProgOpt);
  $sProgAndDb or return;
  my $CallBlast = "$sProgAndDb $$pSeq{SrcTmp} $sParam";

  # get/create output file handle
  my ($PathBlastPlain,$hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } else {
    $PathBlastPlain = &PrepOutpath ($$pSeq{SrcPath},
      $$pSeq{SrcMulti}?&SeqidWord($$pSeq{id}):'', -debug=>$dbg2);
    $PathBlastPlain or return;
    $hOut = FileHandle->new($PathBlastPlain,'w') or return;
  }

  { # retry block

    # call BLAST, read from streams
    my $hInBlast;
    my $bRedo = 0;
    # hand over the command as an argument, it may contain '%'
    $debug and printf STDERR "%s. calling command '%s'\n", &MySub, $CallBlast;
    unless ($hInBlast = FileHandle->new("$CallBlast |")) {
      printf STDERR "%s. ERROR: unable to start BLAST process, call was:\n  %s\n", &MySub,
        $CallBlast;
      return;
    }
    # use a lexical line buffer, do not clobber the caller's $_
    while (defined (my $line=<$hInBlast>)) {
      if ($line =~ m/^FATAL:  .*(ExpandX|WordFinderSearch)/) { $bRedo=1 }
      print  $hOut $line;
    }
    # finish the BLAST process before a possible retry
    $hInBlast->close();
    $ProgOpt{-OutTab} and print  $hOut "//\n";

    # BLAST error handling
    # NOTE(review): the report of the failed search has already been written
    #   to the output, and the number of retries is not limited - confirm
    #   that this is intended.
    if ($bRedo) {
      $debug and printf STDERR "%s. BLAST runtime error, repeating BLAST search\n", &MySub;
      redo;
    }
  } # end retry block
}


# do BLAST, produce WWW output
#
# INTERFACE
# - argument 1: reference to sequence data structure (query seq). Fields
#               read: SrcTmp, SrcTmpGrp, SrcPath, SrcMulti, id
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -extern     extern HTML access (WWW):
#               - no link to query file on local file system.
#               - nebulise database path
#               - title and image source are written as placeholders
#                 INSERTQUERYID / INSERTIMG
#
# DESCRIPTION
# - output consists of a HTML-formatted report (file PathInput.html) linked
#   to a newly generated png (file PathInput.png).
# - the function returns without value. On error, a message is written to
#   STDERR and the output may be left incomplete.
# - the BLAST report data structure is released via &DataDecross on all
#   paths that leave the function after the structure has been created.
#
sub BlastHtml {
  my ($pSeq) = @_;

  # function parameters
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my ($sProgAndDb,$sParam) = &BlastParamStr (%ProgOpt);
  $sProgAndDb or return;
  my $CallBlast = "$sProgAndDb $$pSeq{SrcTmp} $sParam";
  my %path;
  $path{TmpGrp} = $$pSeq{SrcTmpGrp};
  $path{BlastPlain} = $path{TmpGrp}->Create('.bl');
  $path{BlastFinal} = &PrepOutpath ($$pSeq{SrcPath},
    $$pSeq{SrcMulti}?&SeqidWord($$pSeq{id}):'', -OutSuffix=>'.html', -debug=>$dbg2);
  $path{ImgFinal} = &PrepOutpath ($$pSeq{SrcPath},
    $$pSeq{SrcMulti}?&SeqidWord($$pSeq{id}):'', -OutSuffix=>'.png', -debug=>$dbg2);
  unless ($path{BlastFinal} and $path{ImgFinal}) { return }
  my %sect; $sect{time} = &TimeStr();

  # call BLAST, read from streams
  # the plain BLAST report serves two purposes:
  # - produce png image via BLAST data structure
  # - split into report sections
  $path{error} = $path{TmpGrp}->Create('.err');
  # hand over the command as an argument, it may contain '%'
  $debug and printf STDERR "%s. calling command '%s'\n", &MySub, $CallBlast;
  if (int (system ("$CallBlast >$path{BlastPlain} 2>$path{error}") / 256)) {
    printf STDERR "%s. ERROR in BLAST process\n", &MySub;
    printf STDERR "  call: '%s'\n  error output file: %s\n", $CallBlast, $path{error};
    return;
  }

  # produce image
  # retrieve report data structure
  my $pBlParse;
  ($path{ImgPrim},$path{map},$pBlParse) = &BlastImg ($path{BlastPlain}, -debug=>$dbg2);
  foreach (grep{ $_ ne 'SrcTmpGrp' } keys %$pSeq) {
    $$pBlParse{QuerySeq}{$_} ||= $$pSeq{$_};
  }

  # file I/O for final HTML output
  my $hOutHtml = FileHandle->new($path{BlastFinal},'w');
  unless ($hOutHtml) {
    printf STDERR "%s. ERROR: unable to write final HTML output to %s\n", &MySub, $path{BlastFinal}||"''";
    &DataDecross ($pBlParse);
    return;
  }
  my $hInRawhtml;
  {
    ($path{BlastDB}) = ($sProgAndDb=~m/\s+(\S+)\s*$/);
    $path{BlastHtml} = &Blast2Html($path{BlastPlain},-debug=>$dbg2);
    unless ($hInRawhtml = FileHandle->new($path{BlastHtml}, 'r')) {
      printf STDERR "%s. ERROR: unable to read Blast2Html file %s\n", &MySub, $path{BlastHtml}||"''";
      &DataDecross ($pBlParse);
      return;
    }
  }

  # split BLAST report into sections
  ($sect{query},$sect{prog},$sect{db}) = &BlastSections ($path{BlastPlain}, -debug=>$dbg2);

  # HTML header:
  # - HTML header containing title
  # - opening tag of the document body, closed at the end of the report
  $sect{title} = $ProgOpt{-extern} ? "BLAST Result INSERTQUERYID" :
    "BLAST Report for $$pBlParse{QuerySeq}{id}";
  print $hOutHtml <<"HTML_END";
<HTML><HEAD>
  <TITLE>$sect{title}</TITLE>
</HEAD>
<BODY>
HTML_END

  # HTML report: sections 'Query', 'Database', headline section 'Report'
  if ($ProgOpt{-extern}) {
    # hide local database directory
    $sect{db} =~ s|(Database:  ).*/|$1|;
    print $hOutHtml <<"HTML_END";
<H2>Query</H2>
<PRE> <!-- my pid was $$ -->
$sect{query}
Time:   $sect{time}
</PRE>
<H2>Database</H2>
<PRE>$sect{db}
</PRE>
<H2>BLAST Report</H2>
HTML_END
  } else {
    print  $hOutHtml <<"HTML_END";
<H2>Query</H2>
<PRE>$sect{query}
HTML_END
    print  $hOutHtml "File:   <A HREF=\"$$pBlParse{QuerySeq}{SrcPath}\">$$pBlParse{QuerySeq}{SrcPath}</A>\n"
      if (! $ProgOpt{-parse});
    print  $hOutHtml <<"HTML_END";
Time:   $sect{time}
</PRE>
<H2>Database</H2>
<PRE>$sect{db}
</PRE>
<H2>BLAST Report</H2>
HTML_END
  }

  # skip plain BLAST report until HSP index
  my $line;
  while (defined ($line=<$hInRawhtml>) and $line!~m/[\. ]+done$/) { }

  # HTML report: graphical representation (section 'Report')
  if ($path{ImgPrim} and !$$pBlParse{FlagNoHit}) {
    &FileCopy ($path{ImgPrim},$path{ImgFinal}) and unlink $path{ImgPrim};
    $path{HtmlImg} = $ProgOpt{-extern} ? 'INSERTIMG' : $path{ImgFinal};
    print  $hOutHtml "<IMG SRC=\"$path{HtmlImg}\" USEMAP=\"#HSPMAP\" BORDER=0>\n";
    my $hInMap = FileHandle->new($path{map},'r');
    unless ($hInMap) {
      printf STDERR "%s. ERROR: unable to read image map file %s\n", &MySub, $path{map}||"''";
      &DataDecross ($pBlParse);
      return;
    }
    while (defined ($line=<$hInMap>)) { print $hOutHtml $line }
  }
  elsif ($debug) {
    printf STDERR "%s. ERROR: graphical output not available, %s\n", &MySub,
      $$pBlParse{FlagNoHit} ? 'no matches' : 'error in &BlastImg';
  }

  # HTML report: plain report (section 'Report'), section 'BLAST Parameters'
  # NOTE(review): this <PRE> is not closed here, presumably the Blast2Html
  #   output provides the closing tag - confirm
  print  $hOutHtml "<PRE>";
  while (defined ($line=<$hInRawhtml>)) { print $hOutHtml $line }

  # HTML report: section 'BLAST Program'
  print  $hOutHtml <<"HTML_END";
<H2>BLAST Program</H2>
<PRE>$sect{prog}
</PRE>
HTML_END

  # end HTML
  print  $hOutHtml "</BODY></HTML>\n";

  # tidy up
  $debug or unlink(grep{ length() } grep{ defined($_) } $path{BlastHtml},$path{map});
  &DataDecross ($pBlParse);
}


################################################################################
# data flow architecture
################################################################################


# get BLAST report data structure according to parameters and filters
#
# INTERFACE
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   -debug      [STD]
#   -parse      [STD] read ready BLAST reports from the report queue
#               (&HdlQueue) rather than running BLAST on the sequence
#               queue (&SeqQueue)
#   -OutColAdd  derive &BlastParse option -align
#   -OutColHide derive &BlastParse option -complex
#   -ThreshCplx derive &BlastParse option -complex
#
# - return val: - reference to BLAST result data structure, expanded by fields -
#                 in case of report input:
#                   SrcPath         path of the report file as taken from
#                                   the report queue, 'stdin' for '-'
#                   QueryProgAndDb  if not provided by &BlastParse: program
#                                   and database path, separated by a blank
#                 in case of sequence input (see &GetBlastByseq):
#                   QueryProgAndDb  from &SeqLab::Blast::BlastParamStr
#                   QueryParamEff   from &SeqLab::Blast::BlastParamStr
#                   QueryParamStr   from &SeqLab::Blast::BlastParamStr
#                   QuerySeq        enriched sequence data structure
#                   SeqType         from &SeqLab::Blast::BlastParamStr
#                                   sequence types of db, query etc.
#               - undef if end of input is reached or an error occurs
#
sub GetBlast {
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my %hide = map { ($_=>1) } @{$ProgOpt{-OutColHide}};

  # read ready BLAST result
  my $pBlParse;
  if ($ProgOpt{-parse}) {
    my ($pHdl);
    if (not $pHdl=&HdlQueue()) { return undef }
    $pBlParse = &BlastParse ($pHdl,
      -align   => int(grep /align/,@{$ProgOpt{-OutColAdd}})?2:0,
      -complex => (!$hide{hsp_complex} or exists($ProgOpt{-ThreshCplx})),
      -debug   => $dbg2);
    # expand result data structure
    $$pBlParse{SrcPath} = $ProgParam{store}{queue}{ReportPath};
    if ($$pBlParse{SrcPath} eq '-') { $$pBlParse{SrcPath}='stdin' }
    if (!length($$pBlParse{QueryProgAndDb})) {
      # program and database are expected as separate words, like in the
      # string that is derived from &BlastParamStr for sequence input
      $$pBlParse{QueryProgAndDb} = join (' ', grep { defined($_) and length($_) }
        $$pBlParse{program}, $$pBlParse{PathDb});
    }
  }

  # do BLAST and read result
  else {
    my $pSeq;
    if (not $pSeq=&SeqQueue()) { return undef }
    $pBlParse = &GetBlastByseq($pSeq);
  }

  return $pBlParse;
}


# run BLAST for a query sequence and get the report data structure
#
# INTERFACE
# - argument 1: reference to sequence data structure (query seq). The query
#               is read from the file referenced by field SrcTmp.
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   -debug      [STD]
#   -OutColAdd  derive &BlastParse option -align
#   -ThreshCplx derive &BlastParse option -complex
#   NOTE(review): other than in &GetBlast, -OutColHide has no effect on
#     option -complex here - confirm that this is intended.
#
# - return val: - reference to BLAST result data structure, expanded by fields:
#                   QueryProgAndDb  from &SeqLab::Blast::BlastParamStr
#                   QueryParamEff   from &SeqLab::Blast::BlastParamStr
#                   QueryParamStr   from &SeqLab::Blast::BlastParamStr
#                   QuerySeq        enriched sequence data structure from
#                                   queue. Fields of argument 1 (except for
#                                   SrcTmpGrp) are added where QuerySeq has
#                                   no true value yet.
#                   SeqType         from &SeqLab::Blast::BlastParamStr
#                                   sequence types of db, query etc.
#               - undef if the argument is missing or the BLAST call cannot
#                 be worked out
#
# DEBUG, CHANGES, ADDITIONS
# - $sProgAndDb, $sParam, $pParamEff, $pSeqType should become part of
#   the BLAST report data structure. When ready BLAST results are read, the
#   parameters MUST be determined from the report itself.
#
sub GetBlastByseq {
  my ($pSeq) = @_;
  $pSeq or return undef;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # work out BLAST call
  my ($sProgAndDb,$sParam,$pParamEff,$pSeqType)
    = &BlastParamStr (%ProgOpt, -debug=>$dbg2);
  unless ($sProgAndDb) { return undef }

  # parsing options
  my $bAlign = int (grep { m/align/ } @{$ProgOpt{-OutColAdd}}) ? 2 : 0;
  my $bComplex = exists ($ProgOpt{-ThreshCplx});

  # do BLAST, get result data structure
  my $pBlParse = &BlastStructBycall ("$sProgAndDb $$pSeq{SrcTmp} $sParam",
    -align     => $bAlign,
    -complex   => $bComplex,
    -WarnFatal => 1,
    -debug     => $dbg2);

  # add query information to result data structure
  $$pBlParse{SeqType} = $pSeqType;
  foreach my $field (keys %$pSeq) {
    if ($field eq 'SrcTmpGrp') { next }
    $$pBlParse{QuerySeq}{$field} ||= $$pSeq{$field};
  }
  @{$pBlParse}{'QueryProgAndDb','QueryParamStr','QueryParamEff'}
    = ($sProgAndDb, $sParam, $pParamEff);

  return $pBlParse;
}


# select BLAST matches according to parameters and filters
#
# INTERFACE
# - argument 1:  reference to BLAST report data structure
#
# - global options:
#   ...          for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   -debug       [STD]
#   -FilterOlap  [STD] reference to array of match fields that decide
#                between matches overlapping on the query sequence
#   -FilterSelf  [STD] skip match having the same identifier as the query
#   -ThreshCplx  [STD] minimum for match field 'complex'
#   -ThreshId    [STD] minimum for match field 'RelId'
#   -ThreshLen   [STD] minimum for match field 'MaxHspLen'
#   -ValS        minimum for match field 'score'
#
# - return val: - reference to array of finally selected matches, embodied
#                 by match data tree structure expanded by fields:
#                 orient          value of rel. orientation if consistent
#                                 over query/match sequence pair, else '.'
#
# DESCRIPTION
# - Matches are evaluated in order of descending score.
# - The returned array references an excerpt of the original BLAST report
#   data structure (values %{$$pBlParse{Match}}). Apart from the added field
#   'orient', the original data structure will be left unchanged (though
#   targeted by referencing in the returned array).
# - Overlap filter: a candidate is compared to each selected match that
#   overlaps on the query. The criteria are evaluated in the given order.
#   The first criterion that differs decides: the candidate is dropped if it
#   has the lower value, the selected match is removed if the candidate has
#   the higher value.
# - IMPORTANT: you will need to call &DataDecross($pBlParse) to correctly
#   free memory when dropping the data.
#
# DEBUG, CHANGES, ADDITIONS
# - update central fields of the data structure according to selection and
#   filtering. But if we realise this feature, original data under $pBlParse
#   will be irreversibly changed.
#
sub GetMatches {
  my ($pBlParse) = @_;

  # thresholds, constant over the whole selection
  my $ThreshCplx  = $ProgOpt{-ThreshCplx}||0;
  my $ThreshLen   = $ProgOpt{-ThreshLen}||0;
  my $ThreshId    = $ProgOpt{-ThreshId}||0;
  my $ThreshScore = $ProgOpt{-ValS}||0;

  # candidate matches, sorted by descending score
  my @MatchCand = sort { $$b{score} <=> $$a{score} } values %{$$pBlParse{Match}};

  # evaluate candidates
  my @MatchSlc;
  GetMatchesCand:
  foreach my $pMatch (@MatchCand) {

    # self-match
    next if ($ProgOpt{-FilterSelf} and $$pMatch{id} eq $$pBlParse{QuerySeq}{id});

    # thresholds
    next if (($$pMatch{complex}||0) < $ThreshCplx);
    next if ( $$pMatch{MaxHspLen}   < $ThreshLen);
    next if ( $$pMatch{RelId}       < $ThreshId);
    next if ( $$pMatch{score}       < $ThreshScore);

    # overlaps with matches selected so far
    if ($ProgOpt{-FilterOlap}) {
      my $CtSlc = 0;
      while ($CtSlc < @MatchSlc) {
        my $pSlc = $MatchSlc[$CtSlc];
        my $bDropSlc = 0;
        if (Math::Range->new(@{$pSlc}{'QueryBeg','QueryEnd'})->overlaps($$pMatch{QueryBeg},$$pMatch{QueryEnd})) {
          foreach my $ItFilter (@{$ProgOpt{-FilterOlap}}) {
            # candidate loses
            if ($$pMatch{$ItFilter} < $$pSlc{$ItFilter}) { next GetMatchesCand }
            # candidate wins
            if ($$pMatch{$ItFilter} > $$pSlc{$ItFilter}) { $bDropSlc = 1; last }
          }
        }
        # after removal, the next selected match moves to the current index
        if ($bDropSlc) {
          splice @MatchSlc, $CtSlc, 1;
        } else {
          $CtSlc ++;
        }
      }
    }

    # orientation: unique value over all HSPs, or '.' if inconsistent
    my $pOrient = &DataTreeSlc ($$pMatch{HSP}, [[undef,'all'],['orient']], -unique=>1);
    $$pMatch{orient} = (int(@$pOrient)>1) ? '.' : $$pOrient[0];

    # candidate is selected
    push @MatchSlc, $pMatch;
  }

  return \@MatchSlc;
}


# get BLAST HSPs according to parameters and filters
#
# INTERFACE
# - argument 1:  reference to BLAST report data structure
#
# - global options:
#   ...          for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   -debug       [STD]
#   -FilterOlap  [STD], reference to list of HSP fields that rank HSPs which
#                overlap on the query, in order of priority
#   -FilterSelf  [STD]
#   -ThreshCplx  [STD]
#   -ThreshId    [STD]
#   -ThreshLen   [STD]
#   -ValE        HSPs having a higher expect value are skipped (if set)
#   -ValS        HSPs having a lower score are skipped
#
# - return val:  reference to array of finally selected HSPs (possibly empty)
#
# DESCRIPTION
# - $pHsp ($ret[0]) will reference an excerpt of the original BLAST report
#   data structure (values %{$$pBlParse{Match}}). The original data structure
#   will be left unchanged (though targeted by referencing in $pHsp), except
#   that field 'idnum' (index of the HSP inside its match) is set on every
#   HSP that is visited.
# - A missing match section in the report is reported on STDERR only, the
#   function does not return early.
# - Overlap filtering: a candidate HSP is compared to each previously selected
#   HSP that overlaps in query range. The fields listed in -FilterOlap are
#   compared one after another; the first field that differs decides which of
#   the two HSPs is dropped (the lower value loses). If all fields are equal
#   the candidate is dropped.
# - IMPORTANT: you will need to call &DataDecross($pBlParse) to correctly
#   free memory when dropping the data.
#
# DEBUG, CHANGES, ADDITIONS
# - update central fields of the data structure according to selection and
#   filtering. But then, original data under $pBlParse will be irreversibly
#   changed.
#
sub GetHsps {
  my ($pBlParse) = @_;
  my $debug = $ProgOpt{-debug};
  if (! $$pBlParse{Match}) {
    printf STDERR "%s. WARNING: query seq statement missing in BLAST report data structure, possibly program error\n", &MySub;
  }

  # loop over match entries, best-scoring match first: filter
  my ($pHsp, $CtHsp, $CtHspOld, @HspSlc);
  foreach my $pMatch ( map { $$pBlParse{Match}{$_} } sort {
    $$pBlParse{Match}{$b}{score} <=> $$pBlParse{Match}{$a}{score};
    } keys %{$$pBlParse{Match}}) {

    # filter self-match
    if ($ProgOpt{-FilterSelf} and $$pMatch{id} eq $$pBlParse{QuerySeq}{id}) { next }

    # filter applying thresholds
    # pre-selection on matches should make the whole procedure faster
    if (($$pMatch{complex}||0) < ($ProgOpt{-ThreshCplx}||0)) { next }
    if ( $$pMatch{RelId}       < ($ProgOpt{-ThreshId}||0))   { next }
    if ( $$pMatch{MaxHspLen}   < ($ProgOpt{-ThreshLen}||0))  { next }

    # loop over HSPs
    GetHspsFlt:
    for ($CtHsp=0; $CtHsp<@{$$pMatch{HSP}}; $CtHsp++) {
      $pHsp = $$pMatch{HSP}[$CtHsp];
      $$pHsp{idnum} = $CtHsp;

      # filter applying thresholds
      if (($$pHsp{complex}||0) < ($ProgOpt{-ThreshCplx}||0)) { next }
      if ( $$pHsp{RelId}       < ($ProgOpt{-ThreshId}||0))   { next }
      if ( $$pHsp{QueryLen}    < ($ProgOpt{-ThreshLen}||0))  { next }
      if ( $$pHsp{score}       < ($ProgOpt{-ValS}||0))       { next }
        # HSPs may have lower score than specified S
      if (($ProgOpt{-ValE}||0) and $$pHsp{expect}>$ProgOpt{-ValE}) { next }
        # HSPs may have higher expectancy than specified E

      # filter overlaps
      if ($ProgOpt{-FilterOlap}) {
        for ($CtHspOld=0; $CtHspOld<@HspSlc; $CtHspOld++) {
          my $pHspOld = $HspSlc[$CtHspOld];
          if (! Math::Range->new(@{$pHspOld}{'QueryBeg','QueryEnd'})->overlaps(@{$pHsp}{'QueryBeg','QueryEnd'})) { next }

          # rank candidate against the selected HSP, first differing field
          # decides; $rank: >0 candidate wins, <=0 selected HSP stays
          my ($bCompared, $rank) = (0, 0);
          foreach my $ItFilter (@{$ProgOpt{-FilterOlap}}) {
            $bCompared = 1;
            $rank = ($$pHsp{$ItFilter} <=> $$pHspOld{$ItFilter}) || 0;
            if ($rank) { last }
          }
          # empty filter list: nothing to decide, keep both
          $bCompared or next;

          if ($rank > 0) {
            $debug and printf STDERR "%s. filtering former HSP, ID %s, no. %d\n", &MySub,
              $$pHspOld{MatchR}{id}||"''", $$pHspOld{idnum};
            # drop the selected HSP, re-visit the index that slid into place
            splice @HspSlc, $CtHspOld, 1;
            $CtHspOld --;
          } else {
            $debug and printf STDERR "%s. filtering next HSP, ID %s, no. %d\n", &MySub,
              $$pHsp{MatchR}{id}||"''", $$pHsp{idnum};
            next GetHspsFlt;
          }
        }
      }

      # enter HSP
      push @HspSlc, $pHsp;
    }
  }

  # exit SUB
  return \@HspSlc;
}


################################################################################
# data structure-oriented output
################################################################################


# report BLAST hits as data structure-oriented output
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetMatches
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutTab     [STD]
#
# DESCRIPTION
# - The match section of the report ($$pBlParse{Match}) is replaced by the
#   array of matches selected in &GetMatches, then the whole report is
#   printed via &DataPrint.
# - Output goes to $ProgOpt{OutTabHandle} if -OutTab is set, otherwise to a
#   file having suffix '.bll' that is derived via &PrepOutpath.
#
sub BlastData {
  my ($pBlParse) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;

  # get selected matches
  # Take the first return value only. Assigning a second list element to
  # $pBlParse would wipe the report reference, since &GetMatches returns the
  # selection alone.
  my ($pMatchSlc) = &GetMatches ($pBlParse);
  $pMatchSlc or return;
  $$pBlParse{Match} = $pMatchSlc;

  # output directives
  my ($PathOut,$hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } else {
    $PathOut = &PrepOutpath ($$pBlParse{SrcPath}||$$pBlParse{QuerySeq}{SrcPath},
      &SeqidWord($$pBlParse{QuerySeq}{id}), -OutSuffix=>'.bll', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||"- failed, no path";
    $PathOut or return;
    $hOut = FileHandle->new($PathOut,'w') or return;
  }
  # output
  &DataPrint ($pBlParse, -handle=>$hOut);

  # tidy up
  &DataDecross ($pBlParse);
}


# enter BLAST result graph of HSPs
#
# INTERFACE
# - no arguments; BLAST reports are fetched in a loop from &GetBlast until
#   it returns a false value
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutColAdd  [STD], 'align' reduces HSP field 'align' to the sequence
#               strings of alignment lines 0 and 2
#   -OutColHide [STD], effective values: seqlen, range, queryseq_range,
#               matchseq_range, and the sequence fields id, descr, length
#   -OutStump   output path stump, used if -OutTab is not set
#   -OutTab     [STD]
#
# DESCRIPTION
# - An index of sequences (keyed by sequence ID) is built. Each entry holds
#   the non-hidden sequence fields, flags isQuery / isMatch, and for queries
#   the list of HSPs. Every HSP entry is a copy of the selected HSP having
#   cross-references QueryR and MatchR into the index.
# - A query ID that was already processed as a query is skipped with a
#   warning.
# - Output goes to $ProgOpt{OutTabHandle} (-OutTab), to a '.dat' file
#   (-OutStump), or to STDOUT.
#
sub BlastCrossHsp {

  # function parameters
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;
  my %hide = map { ($_=>1) } @{$ProgOpt{-OutColHide}};
  if (exists($hide{seqlen}) and $hide{seqlen}) { $hide{length}=1 }
  my %add = map { ($_=>1) } @{$ProgOpt{-OutColAdd}};
    # program option -OutColAdd=align will also take effect in
    # &GetBlast/&GetByseq
  my @SeqField = grep {!$hide{$_}} qw{id descr length};

  my $pSeqIdx = {};
  my ($pBlParse, $pSeqQuery, $pSeqMatch);

  # loop over queries -> reports
  while ($pBlParse = &GetBlast()) {
    if (exists($$pSeqIdx{$$pBlParse{QuerySeq}{id}}) and
      $$pSeqIdx{$$pBlParse{QuerySeq}{id}}{isQuery}
    ) {
      printf STDERR "WARNING: HSP entry already exists for seq %s\n", $$pBlParse{QuerySeq}{id};
      printf STDERR "  probably duplicated query sequence\n";
      # release the skipped report, like at the regular end of the loop
      &DataDecross ($pBlParse);
      next;
    }

    # enter query sequence to index of sequences
    my $pHspSlc = &GetHsps($pBlParse) or next;
    $pSeqQuery = $$pSeqIdx{$$pBlParse{QuerySeq}{id}} ||= { };
    if (! %$pSeqQuery) {
      foreach (@SeqField) {
        $$pSeqQuery{$_} = $$pBlParse{QuerySeq}{$_};
      }
    }
    $$pSeqQuery{isQuery} = 1;
      # sign sequence as being queried
      # This flag is useful in addition to @{$$pSeqQuery{HSP}} because a query
      #   does not necessarily produce matches.

    # loop over HSP entries
    foreach my $pHsp (@$pHspSlc) {

      # enter matching sequence to index of sequences
      $pSeqMatch = $$pSeqIdx{$$pHsp{MatchR}{id}} ||= { };
      if (! %$pSeqMatch) {
        foreach (@SeqField) {
          $$pSeqMatch{$_} = $$pHsp{MatchR}{$_};
        }
      }
      $$pSeqMatch{isMatch} = 1;

      # enter HSP
      # hidden range fields are removed from the HSP prior to copying
      if (%hide) {
        if ($hide{range}||$hide{queryseq_range}) {
          delete $$pHsp{QueryBeg};
          delete $$pHsp{QueryEnd};
        }
        if ($hide{range}||$hide{matchseq_range}) {
          delete $$pHsp{MatchBeg};
          delete $$pHsp{MatchEnd};
        }
      }
      if (exists($add{align}) and $add{align}) {
        @{$$pHsp{align}} = map { $_->{sequence} } @{$$pHsp{align}}[0,2];
      }
      # copy HSP; QueryR/MatchR given last override fields of the original
      push @{$$pSeqQuery{HSP}}, {
        %$pHsp,
        QueryR => $pSeqQuery,
        MatchR => $pSeqMatch,
        };
    }

    # tidy up
    &DataDecross ($pBlParse);
  }

  # get/create output file handle
  my ($PathOut, $hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } elsif ($ProgOpt{-OutStump}) {
    $PathOut = &PrepOutpath ($ProgOpt{-OutStump}, undef,
      -OutSuffix=>'.dat', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||"- failed, no path";
    $PathOut or return;
    $hOut = FileHandle->new($PathOut,'w') or return;
  } else {
    $hOut = \*STDOUT;
  }
  # output
  &DataPrint ($pSeqIdx, -handle=>$hOut);
}


################################################################################
# list output
################################################################################


# report BLAST hits as list consisting of single line per query
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetMatches
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#
# - global data:
#   $ProgOpt{OutTabHandle}  handle that receives the output line
#
# DESCRIPTION
# - print one line of tabular summary for the whole BLAST report.
#   The super-tabular output is organized in MAIN.
# - columns: query ID, CPU time spent since entering this function, number of
#   selected matches, highest score, highest relative identity, list of IDs
#   of the selected matches (separated by blanks). Score and identity are
#   left empty if there is no match.
# - This is a highly specialized procedure for finding new SimClusters.
#
sub BlastListQuery {
  my ($pBlParse) = @_;
  my $hOut = $ProgOpt{OutTabHandle};
  my $SeqidMod = $SeqidFunc{$ProgOpt{-OutIdFmt}};
  my $TimeStart = &Sum ((times)[0,2]);

  # select matches
  my $pMatchSlc = &GetMatches($pBlParse);
  $pMatchSlc or return;

  # collect summary values
  my $IdQuery  = &$SeqidMod ($$pBlParse{QuerySeq}{id});
  my $TimeUsed = &Sum ((times)[0,2]) - $TimeStart;
  my $CtMatch  = int @$pMatchSlc;
  my $ScoreMax = &Max (map { $_->{score} } @$pMatchSlc);
  # maximum HSP length is determined, but it is not part of the output line
  my $HspLenMax = &Max (map { $_->{MaxHspLen} } @$pMatchSlc);
  my $RelIdMax = &Max (map { $_->{RelId} } @$pMatchSlc);
  my @IdMatch;
  foreach my $pMatch (@$pMatchSlc) {
    push @IdMatch, &$SeqidMod ($$pMatch{id});
  }
  my $IdList = join (' ', @IdMatch);

  # compile and print line
  my @column = ($IdQuery, sprintf ("%.3f", $TimeUsed), sprintf ("%d", $CtMatch));
  if ($CtMatch) {
    push @column, sprintf ("%d", $ScoreMax), sprintf ("%.3f", $RelIdMax);
  } else {
    push @column, '', '';
  }
  push @column, $IdList || '';
  print  $hOut join ("\t", @column), "\n";

  # tidy up
  &DataDecross ($pBlParse);
}


# report BLAST hits as list of matching database entries
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetMatches
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutColHide [STD], columns to be omitted in standard table format
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#   -OutTab     [STD]
#   -OutTabFmt  'gff' selects GFF-like output, anything else the standard
#               table format
#
# DESCRIPTION
# - One line is written per selected match, preceded by the table header
#   from &PrepTabHeader and a line of column labels.
# - Unless column highest_complex is hidden, option -ThreshCplx is set to 0
#   if it has no true value yet.
# - In GFF format, $$pBlParse{QueryProgAndDb} is rewritten in place (leading
#   path removed, first blank changed to '_').
#
# DEBUG, CHANGES, ADDITIONS
# - nothing at the moment
#
sub BlastListMatch {
  my ($pBlParse) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;
  my %hide = map { ($_=>1) } @{$ProgOpt{-OutColHide}};
  unless ($hide{highest_complex}) { $ProgOpt{-ThreshCplx}||=0 }
  my $SeqidMod = $SeqidFunc{$ProgOpt{-OutIdFmt}};

  # get BLAST result data structure
  my $pMatchSlc = &GetMatches($pBlParse) or return;

  # output directives, table header
  my ($PathOut,$hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } else {
    $PathOut = &PrepOutpath ($$pBlParse{SrcPath}||$$pBlParse{QuerySeq}{SrcPath},
      &SeqidWord($$pBlParse{QuerySeq}{id}), -OutSuffix=>'.bll', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||"- failed, no path";
    $PathOut or return;
    $hOut = FileHandle->new($PathOut,'w') or return;
  }
  # - start list output with header
  print  $hOut &PrepTabHeader($pBlParse);

  ##############################################################################
  # table body - GFF format

  if (($ProgOpt{-OutTabFmt}||'') eq 'gff') {
    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n", join ("\t",
      'query_id',
      'program_and_db',
      'feature',
      'offset',
      'end',
      'score',
      'orientation',
      'frame',
      'group',
      );

    # loop over match entries: output
    # program/database statement: strip leading path, join words by '_'
    $$pBlParse{QueryProgAndDb} =~ s#^\s*\S+/##;
    $$pBlParse{QueryProgAndDb} =~ s# +(\S+/)?#_#;
    foreach my $pMatch (@$pMatchSlc) {
      print  $hOut join ("\t",
        &$SeqidMod($$pBlParse{QuerySeq}{id}),
        $$pBlParse{QueryProgAndDb}||'.',
        'match',
        $$pMatch{QueryBeg},
        $$pMatch{QueryEnd},
        $$pMatch{score},
        $$pMatch{orient},
        '.',
        &$SeqidMod($$pMatch{id}),
        ), "\n";
    }
  }

  ##############################################################################
  # table body - standard format

  else {
    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n", join ("\t",
      $hide{queryseq_id} ? () : 'queryseq_id',
      ($hide{seqlen}||$hide{queryseq_len}) ? () : 'queryseq_len',
      ($hide{range}||$hide{queryseq_range}) ? () : 'queryseq_range',
      ($hide{descr}||$hide{queryseq_descr}) ? () : 'queryseq_descr',
                   'matchseq_id',
      ($hide{seqlen}||$hide{matchseq_len}) ? () : 'matchseq_len',
      ($hide{range}||$hide{matchseq_range}) ? () : 'matchseq_range',
      ($hide{descr}||$hide{matchseq_descr}) ? () : 'matchseq_descr',
                   'hsp_sum',
                   'highest_score',
                   'lowest_expect',
                   'highest_rel_identity',
      $hide{highest_complex} ? () : 'highest_complex',
      );

    # loop over match entries: output
    foreach my $pMatch (@$pMatchSlc) {
      # range columns span the outermost HSP borders of the match
      my @PosQuery = map { ($_->{QueryBeg}, $_->{QueryEnd}) } @{$$pMatch{HSP}};
      my @PosMatch = map { ($_->{MatchBeg}, $_->{MatchEnd}) } @{$$pMatch{HSP}};
      print  $hOut join ("\t",
        $hide{queryseq_id} ? ()
                   : &$SeqidMod($$pBlParse{QuerySeq}{id}),
        ($hide{seqlen}||$hide{queryseq_len}) ? () : $$pBlParse{QuerySeq}{length},
        ($hide{range}||$hide{queryseq_range}) ? ()
                   : sprintf ("%d..%d", &Min (@PosQuery), &Max (@PosQuery)),
        ($hide{descr}||$hide{queryseq_descr}) ? () : $$pBlParse{QuerySeq}{descr},
                     &$SeqidMod($$pMatch{id}),
        ($hide{seqlen}||$hide{matchseq_len}) ? () : $$pMatch{length},
        ($hide{range}||$hide{matchseq_range}) ? ()
                   : sprintf ("%d..%d", &Min (@PosMatch), &Max (@PosMatch)),
        ($hide{descr}||$hide{matchseq_descr}) ? () : $$pMatch{descr},
                     int @{$$pMatch{HSP}},
                     $$pMatch{score},
                     $$pMatch{expect},
                     sprintf ("%.3f", $$pMatch{RelId}),
        $hide{highest_complex} ? () : $$pMatch{complex},
        ), "\n";
    }
  }

  # tidy up
  &DataDecross ($pBlParse);
}


# report BLAST hits as list of IDs of matching database entries
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
# - options:    accepted for interface compatibility, currently not evaluated
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetMatches
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutTab     [STD], used as output file of the sequence output queue
#
# - global data:
#   $ProgParam{store}{queue}{SeqOut}  output queue that receives the selected
#               match entries
#
# DESCRIPTION
# - The selected match entries are pushed to the sequence output queue. The
#   queue is directed to the file given by -OutTab or, without -OutTab, to a
#   '.bll' file derived via &PrepOutpath.
#
sub BlastListId {
  my ($pBlParse, %opt) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;

  # get BLAST result data structure
  my $pMatchSlc = &GetMatches($pBlParse) or return;

  # get/create output file handle
  my ($PathOut);
  if (! $ProgOpt{-OutTab}) {
    $PathOut = &PrepOutpath ($$pBlParse{SrcPath}||$$pBlParse{QuerySeq}{SrcPath},
      &SeqidWord($$pBlParse{QuerySeq}{id}), -OutSuffix=>'.bll', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||' failed, missing path';
  }

  # output IDs of matches
  $ProgParam{store}{queue}{SeqOut}->AddSwitch(-file=>$ProgOpt{-OutTab}||$PathOut);
  $ProgParam{store}{queue}{SeqOut}->Push(@$pMatchSlc);

  # tidy up
  &DataDecross ($pBlParse);
}


# output BLAST result as a list of HSPs
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -OutColAdd  [STD], 'align' appends the three alignment lines as columns
#               (standard table format only)
#   -OutColHide [STD]
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#   -OutTab     [STD]
#   -OutTabFmt  'gff' selects GFF-like output, anything else the standard
#               table format
#
# DESCRIPTION
# - One line is written per selected HSP, preceded by the table header from
#   &PrepTabHeader and a line of column labels.
# - Frame columns are output only if $$pBlParse{CompType} is 'protein'.
# - Unless column hsp_complex is hidden, option -ThreshCplx is set to 0 if it
#   has no true value yet.
# - In GFF format, $$pBlParse{QueryProgAndDb} is rewritten in place (leading
#   path removed, first blank changed to '_').
#
sub BlastListHsp {
  my ($pBlParse) = @_;
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;
  my %hide = map { ($_=>1) } @{$ProgOpt{-OutColHide}};
  my %add = map { ($_=>1) } @{$ProgOpt{-OutColAdd}};
  unless ($hide{hsp_complex}) { $ProgOpt{-ThreshCplx}||=0 }
  my $SeqidMod = $SeqidFunc{$ProgOpt{-OutIdFmt}};

  # get BLAST result data structure
  my $pHspSlc = &GetHsps($pBlParse) or return;

  ##############################################################################
  # output directives, table header

  # get/create output file handle
  my ($PathOut,$hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } else {
    $PathOut = &PrepOutpath ($$pBlParse{SrcPath}||$$pBlParse{QuerySeq}{SrcPath},
      &SeqidWord($$pBlParse{QuerySeq}{id}), -OutSuffix=>'.bll', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||"- failed, no path";
    $PathOut or return;
    $hOut = FileHandle->new($PathOut,'w') or return;
  }

  # start list output with header
  print  $hOut &PrepTabHeader($pBlParse);

  ##############################################################################
  # table body - GFF format

  if (($ProgOpt{-OutTabFmt}||'') eq 'gff') {
    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n", join ("\t",
      'query_id',
      'program_and_db',
      'feature',
      'offset',
      'end',
      'score',
      'orientation',
      'frame',
      'group',
      );

    # loop over HSP entries: output
    # program/database statement: strip leading path, join words by '_'
    $$pBlParse{QueryProgAndDb} =~ s#^\s*\S+/##;
    $$pBlParse{QueryProgAndDb} =~ s# +(\S+/)?#_#;
    foreach my $pHsp (@$pHspSlc) {
      print  $hOut join ("\t",
        &$SeqidMod($$pBlParse{QuerySeq}{id}),
        $$pBlParse{QueryProgAndDb}||'.',
        'HSP',
        $$pHsp{QueryBeg},
        $$pHsp{QueryEnd},
        $$pHsp{score},
        &SignChar ($$pHsp{orient}, '-allow0'=>1),
        $$pHsp{QueryFr},
        &$SeqidMod($$pHsp{MatchR}{id}),
        ), "\n";
    }
  }

  ##############################################################################
  # table body - standard format

  else {
    my $bProtein = ($$pBlParse{CompType} eq 'protein');

    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n", join ("\t",
      $hide{queryseq_id} ? () : 'queryseq_id',
      ($hide{seqlen}||$hide{queryseq_len}) ? () : 'queryseq_len',
      ($hide{range}||$hide{queryseq_range}) ? () : 'queryseq_range',
      $bProtein ? 'queryseq_frame' : (),
      ($hide{descr}||$hide{queryseq_descr}) ? () : 'queryseq_descr',
                   'matchseq_id',
      ($hide{seqlen}||$hide{matchseq_len}) ? () : 'matchseq_len',
      ($hide{range}||$hide{matchseq_range}) ? () : 'matchseq_range',
      $bProtein ? 'matchseq_frame' : (),
      ($hide{descr}||$hide{matchseq_descr}) ? () : 'matchseq_descr',
                   'hsp_nmb',
                   'hsp_score',
                   'hsp_expect',
                   'hsp_rel_identity',
      $hide{hsp_complex} ? () : 'hsp_complex',
                   'hsp_len',
                   'orientation',
      $add{align} ? qw(align_query align_comp align_match) : (),
      );

    # loop over HSP entries: output
    foreach my $pHsp (@$pHspSlc) {
      print  $hOut join ("\t",
        $hide{queryseq_id} ? ()
                   : &$SeqidMod($$pBlParse{QuerySeq}{id}),
        ($hide{seqlen}||$hide{queryseq_len}) ? () : $$pBlParse{QuerySeq}{length},
        ($hide{range}||$hide{queryseq_range}) ? () : "$$pHsp{QueryBeg}..$$pHsp{QueryEnd}",
        $bProtein ? $$pHsp{QueryFr} : (),
        ($hide{descr}||$hide{queryseq_descr}) ? () : $$pBlParse{QuerySeq}{descr},
                     &$SeqidMod($$pHsp{MatchR}{id}),
        ($hide{seqlen}||$hide{matchseq_len}) ? () : $$pHsp{MatchR}{length},
        ($hide{range}||$hide{matchseq_range}) ? () : "$$pHsp{MatchBeg}..$$pHsp{MatchEnd}",
        $bProtein ? $$pHsp{MatchFr} : (),
        ($hide{descr}||$hide{matchseq_descr}) ? () : $$pHsp{MatchR}{descr},
                     $$pHsp{idnum},
                     $$pHsp{score},
                     $$pHsp{expect},
                     sprintf ('%.3f', $$pHsp{RelId}),
        $hide{hsp_complex} ? () : $$pHsp{complex},
                     $$pHsp{QueryLen},
                     $$pHsp{orient},
        $add{align} ? (map { $_->{sequence} } @{$$pHsp{align}}[0..2]) : (),
        ), "\n";
    }
  }

  # tidy up
  &DataDecross ($pBlParse);
}


# output BLAST result as a plot of maximum local HSP score
#
# INTERFACE
# - argument 1: referenced BLAST report data structure (from &GetBlast)
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   ...         for global file output options see &PrepOutpath
#   -debug      [STD], leave temporary files
#   -ListScore  mode switch arg: recalculate scores
#               argument is lower case, implemented: log10
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#   -OutTab     [STD]
#   -OutTabFmt  'gff' selects GFF-like output, anything else the standard
#               table format
#
# DESCRIPTION
# - For each query frame a plot is built that holds, for every query position
#   (0 .. query length), a copy of the best-scoring HSP covering that position
#   or a dummy entry having score 0.
# - standard format: one line per frame and position, columns frame, pos,
#   score, match_id (match_id is empty where no HSP covers the position).
# - GFF format: the frames are merged to a single plot (best score wins).
#   Consecutive positions having the same score and frame are written as one
#   line.
#
sub BlastListScore {
  my ($pBlParse) = @_;
  my @DefaultPlot = ( { score=>0 } ) x ($$pBlParse{QuerySeq}{length}+1);
  my %ActRecalc;
  # log10 transformation of a plot entry's score, in place, floor 0
  $ActRecalc{log10} = sub {
    my $pHsp = $_[0];
    $$pHsp{score} = $$pHsp{score} ? &Max (log($$pHsp{score})/log(10)-1.5, 0) : 0;
    };

  # function parameters
  my $debug = $ProgOpt{-debug};
  my $dbg2  = $debug ? $debug - 1 : undef;
  my $SeqidMod = $SeqidFunc{$ProgOpt{-OutIdFmt}};

  # get BLAST result data structure
  my $pHspSlc = &GetHsps($pBlParse) or return;

  # translate HSPs to plot of scores
  # every covered position gets its own copy of the HSP, so that a later
  # recalculation of scores works per position
  my %plot;
  foreach my $pHsp (@$pHspSlc) {
    my $ItFrame = $$pHsp{QueryFr};
    $plot{$ItFrame} ||= &DataClone (\@DefaultPlot);
    for my $p ($$pHsp{QueryBeg} .. $$pHsp{QueryEnd}) {
      $plot{$ItFrame}[$p] = (
        sort { $$b{score} <=> $$a{score} } { %$pHsp }, $plot{$ItFrame}[$p]
        )[0];
    }
  }
  unless (keys %plot) {
    $plot{NONE} = &DataClone (\@DefaultPlot);
  }

  # recalculate scores?
  my $pActRecalc=$ActRecalc{$ProgOpt{-ListScore}||''};
  if ($pActRecalc and ref($pActRecalc) eq 'CODE') {
    foreach my $ItFrame (keys %plot) {
      foreach (my $i=0; $i<int(@{$plot{$ItFrame}}); ++$i) {
        &$pActRecalc ($plot{$ItFrame}[$i]);
      }
    }
  }

  ##############################################################################
  # output directives, table header

  # get/create output file handle
  my ($PathOut,$hOut);
  if ($ProgOpt{-OutTab}) {
    $hOut = $ProgOpt{OutTabHandle};
  } else {
    $PathOut = &PrepOutpath ($$pBlParse{SrcPath}||$$pBlParse{QuerySeq}{SrcPath},
      &SeqidWord($$pBlParse{QuerySeq}{id}), -OutSuffix=>'.bll', -debug=>$dbg2);
    $debug and printf STDERR "%s. writing file %s\n", &MySub,
      $PathOut||"- failed, no path";
    $PathOut or return;
    $hOut = FileHandle->new($PathOut,'w') or return;
  }

  # start list output with header
  print  $hOut &PrepTabHeader($pBlParse);

  ##############################################################################
  # table body - GFF format
  # - reduce to one-dimensional plot (winner of all frames)

  if (($ProgOpt{-OutTabFmt}||'') eq 'gff') {
    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n", join ("\t",
      'query_id',
      'program_and_db',
      'feature',
      'offset',
      'end',
      'score',
      'orientation',
      'frame',
      'group',
      );
    $$pBlParse{QueryProgAndDb} =~ s#^\s*\S+/##;
    $$pBlParse{QueryProgAndDb} =~ s# +(\S+/)?#_#;

    # combine frames
    # the list of frames is fixed before $plot{all} is created, so the merged
    # plot is never taken for an input frame
    my @FrameKey = sort keys %plot;
    my $Key1st = $FrameKey[0];
    if (int (@FrameKey) > 1) {
      my @PlotAll;
      for (my $i=0; $i<int(@{$plot{$Key1st}}); ++$i) {
        $PlotAll[$i] = (
          sort { $$b{score}<=>$$a{score} }
          map { $plot{$_}[$i] }
          @FrameKey )[0];
      }
      $plot{all} = \@PlotAll;
    } else {
      printf STDERR "one plot only: %s\n", $Key1st||"''";
      $plot{all} = $plot{$Key1st};
    }
    push @{$plot{all}}, { score=>0 };  # enforce output of very last hit (need a change in score finally)

    # work through whole query range
    my $pPlotGrp;
    for (my $p=0; $p<@{$plot{all}}; ++$p) {

      # flush homogeneous HSP range
      if ($pPlotGrp and (
            !$plot{all}[$p]{score} or
            $$pPlotGrp{frame} ne $plot{all}[$p]{QueryFr} or
            $$pPlotGrp{score} != $plot{all}[$p]{score} or 0)
      ) {
        print  $hOut join ("\t",
          &$SeqidMod($$pBlParse{QuerySeq}{id}),
          $$pBlParse{QueryProgAndDb}||'.',
          'HSP_score',
          $$pPlotGrp{offset},
          $$pPlotGrp{end},
          $$pPlotGrp{score},
          &SignChar ($$pPlotGrp{frame}),
          abs ($$pPlotGrp{frame}) - 1,
          $$pPlotGrp{group},  # match ID of the first position of the range
          ), "\n";
        undef $pPlotGrp;
      }

      # skip non-HSP ranges
      $plot{all}[$p]{score} or next;

      # group homogeneous HSP ranges
      if ($pPlotGrp) {
        $$pPlotGrp{end} ++;
      } else {
        $pPlotGrp = {
          offset => $p,
          end    => $p,
          frame  => $plot{all}[$p]{QueryFr},
          score  => $plot{all}[$p]{score},
          group  => &$SeqidMod($plot{all}[$p]{MatchR}{id}),
          };
      }
    }
  }

  ##############################################################################
  # table body - standard format

  else {
    # column labelling
    printf $hOut "#\n# column labels:\n# %s\n",
      join ("\t", qw(frame pos score match_id));
    # loop through plot: output
    # NOTE(review): score is printed as integer (%d), this truncates scores
    #   that were recalculated via -ListScore=log10 - confirm intention
    foreach my $ItFrame (sort { int($a)<=>int($b) } keys %plot) {
      for (my $i=0; $i<int(@{$plot{$ItFrame}}); ++$i) {
        my $pEntry = $plot{$ItFrame}[$i];
        # positions without HSP have no match reference -> empty match_id
        my $IdMatch = $$pEntry{MatchR} ? &$SeqidMod($$pEntry{MatchR}{id}) : '';
        defined ($IdMatch) or $IdMatch = '';
        printf $hOut "%s\t%d\t%d\t%s\n", $ItFrame, $i,
          $$pEntry{score}, $IdMatch;
      }
    }
  }

  # tidy up
  &DataDecross ($pBlParse);
}


################################################################################
# annotation and masking
################################################################################


# extract matching sequence range
#
# INTERFACE
# - argument 1: reference to BLAST result data structure
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   -debug      [STD], leave temporary files
#   -HitSurrd   [STD], size of the flanks that are added to both sides of the
#               HSP range on the matching sequence
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#   -OutSeq*    [STD]
#
# - global data:
#   $ProgParam{store}{queue}{SeqOut}  output queue that receives the sequence
#               fragments
#
# DESCRIPTION
# - For every selected HSP the matching database sequence is retrieved via
#   the xdget call configured in %CorePath (each sequence is requested only
#   once per report), and the HSP range plus flanks is pushed to the sequence
#   output queue.
# - Where a flank would extend beyond the sequence ends, the fragment is
#   padded with gap characters '-' to the requested flank size.
# - HSPs in minus orientation are output reverse-complemented.
# - The fragment ID gets a suffix ".N", N being the running count of
#   fragments from that matching sequence.
#
sub BlastSeqHsp {
  my ($pBlParse) = @_;
  my $pSeq = $$pBlParse{QuerySeq};
  my $debug = $ProgOpt{-debug};
  $debug and printf STDERR "%s. query sequence %s\n", &MySub, $$pSeq{id}||"''";
  my $HitSurrd = $ProgOpt{-HitSurrd};

  # get BLAST result data structure
  my $pHspSlc = &GetHsps($pBlParse) or return;
  if (! grep { -r $_ } map { $$pBlParse{DbPath}.".$_" } 'xni', 'xpi') {
    printf STDERR "%s. ERROR: BLAST database %s is not indexed WashU 2.0 lic. format\n",
      &MySub, $$pBlParse{DbPath};
    print  STDERR map { '  '. $$pBlParse{DbPath}.".$_"."\n" } 'xni', 'xpi';
    return;
  }
  my $RequCall = $CorePath{call}{blast}{xdget} .' '.
    ($CorePath{call}{blast}{SeqType2xdSuffix}{$$pBlParse{SeqType}{db}}||'-n')
    .' '. $$pBlParse{DbPath};

  # loop over HSPs
  my %RequSeq;
  foreach my $pHsp (@$pHspSlc) {
    $debug and printf STDERR "%s. BLAST match %s, HSP %d\n", &MySub,
      $$pHsp{MatchR}{id}||"''", $$pHsp{idnum};

    # determine hit range (+ surrounding) on matching sequence
    # $GapLeft/$GapRight: part of the flank that is beyond the sequence end
    my $pHspRange = Math::Range->new($$pHsp{MatchBeg},$$pHsp{MatchEnd});
    my ($GapLeft, $GapRight) = (0, 0);
    if ($HitSurrd) {
      if ($HitSurrd > $$pHsp{MatchBeg}-1) {
        $pHspRange->lower(1);
        $GapLeft = $HitSurrd - $$pHsp{MatchBeg} + 1;
      } else {
        $pHspRange->lower($pHspRange->lower()-$HitSurrd);
      }
      if ($HitSurrd > $$pHsp{MatchR}{length}-$$pHsp{MatchEnd}) {
        $pHspRange->upper($$pHsp{MatchR}{length});
        $GapRight = $HitSurrd - $$pHsp{MatchR}{length} + $$pHsp{MatchEnd};
      } else {
        $pHspRange->upper($pHspRange->upper()+$HitSurrd);
      }
    }

    # retrieve matching sequence from indexed database
    # NOTE(review): the sequence ID is interpolated into a shell command,
    #   protected by single quotes only - an ID containing a single quote
    #   would break the call
    my $pSeqMatch = $RequSeq{$$pHsp{MatchR}{id}}{seq}
      ||= &SeqentryPopFasta ($RequCall .' '. "'$$pHsp{MatchR}{id}'" .' |');
    $RequSeq{$$pHsp{MatchR}{id}}{ctacc} ++;
    if (! $$pSeqMatch{sequence}) {
      printf STDERR "%s. ERROR: retrieval failed for matching sequence %s\n", &MySub,
        $$pHsp{MatchR}{id};
      delete $RequSeq{$$pHsp{MatchR}{id}};
      next;
    } elsif ($debug) {
      printf STDERR "%s. retrieved matching sequence %s, access #%d, length %d\n", &MySub,
        $$pSeqMatch{id}, $RequSeq{$$pHsp{MatchR}{id}}{ctacc},
        length($$pSeqMatch{sequence});
    }
    # get shortened sequence ID from BLAST parsing
    $$pSeqMatch{id} = &{$SeqidFunc{$ProgOpt{-OutIdFmt}}} ($$pSeqMatch{id});
    delete $$pSeqMatch{header};

    # extract HSP sequence
    my $pSeqHsp = &SeqRange ($pSeqMatch, @$pHspRange);
    # eventually add flanking gaps to HSP sequence
    if ($GapLeft or $GapRight) {
      $$pSeqHsp{sequence} = ('-' x $GapLeft) . $$pSeqHsp{sequence} . ('-' x $GapRight);
    }
    # eventually reverse-complement HSP sequence
    if ($$pHsp{orient}<0) { $pSeqHsp=&SeqRevcompl($pSeqHsp) }
    # add HSP no. to sequence ID
    $$pSeqHsp{id} .= '.'. $RequSeq{$$pHsp{MatchR}{id}}{ctacc};

    # output sequence fragment
    $ProgParam{store}{queue}{SeqOut}->Push($pSeqHsp);
  }

  # tidy up
  &DataDecross ($pBlParse);
}


# enter annotations for sequence ranges that produce hits
#
# INTERFACE
# - argument 1: reference to BLAST result data structure
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   -debug      [STD], leave temporary files
#   -OutIdFmt   selects the ID formatting function from %SeqidFunc
#   -OutSeq*    [STD]
#   -SeqAnnot   argument of the ProgMode switch, used here as the label (type)
#               of the annotations; default: $ProgParam{default}{AnnotLabel}
#               NOTE(review): formerly documented as array reference, but the
#               code uses the value as a plain label - confirm
#   -Valhspmax  is raised (factor 1.8) in $ProgOpt every time BLAST is repeated
#               due to an hspmax overflow; the change persists after return
#
# - global data:
#   $ProgParam{MaskChar}  masking character according to query seq type.
#               The function dies if it is not set.
#   $ProgParam{store}{queue}{SeqOut}  output queue that receives the annotated
#               query sequence
#
# DESCRIPTION
# - Existing annotations having the same label are removed from the query
#   sequence, then one annotation is entered per selected HSP.
# - In parallel, the HSP ranges are masked in $$pSeq{SeqPure}. If a match
#   holds at least as many HSPs as the effective hspmax limit, the masked
#   sequence is written to $$pSeq{SrcTmp}, BLAST is repeated via
#   &GetBlastByseq, and the annotation cycle restarts on the new report.
# - The annotated sequence will be written back to the file path of the
#   sequence source file. See options handed over to SeqStreamOut object.
#
sub BlastSeqAnnot {
  my ($pBlParse) = @_;
  my $pSeq = $$pBlParse{QuerySeq};
  my $debug = $ProgOpt{-debug};
  my $AnnotLabel = $ProgOpt{-SeqAnnot} || $ProgParam{default}{AnnotLabel};
  my $SmbMask = $ProgParam{MaskChar};
  unless ($SmbMask) {
    die sprintf "%s. ERROR: undefined masking character\n", &MySub;
  }
  $debug and printf STDERR "%s. annotating query sequence %s\n", &MySub,
    $$pSeq{id}||"''";

  # delete existing annotations
  $debug and printf STDERR "%s. deleting existing annotations labelled $AnnotLabel\n", &MySub;
  @{$$pSeq{annot}} = grep { $_->{type} ne $AnnotLabel } @{$$pSeq{annot}};

  ##############################################################################
  # $iBl counts the BLAST/annotation cycles
  my $iBl=0;
  { # analyse BLAST, possibly do re-run
    # this bare block is re-entered via 'redo' after a BLAST repetition

    # get BLAST result data structure
    my $pHspSlc = &GetHsps($pBlParse) or return;
    ++ $iBl;

    # loop over HSPs fulfilling match criteria
    # $iMasked: number of HSP ranges masked in this cycle
    # $bOverflow: a match reached the hspmax limit in this cycle
    my $iMasked = 0;
    my $bOverflow = 0;
    foreach my $pHsp (@$pHspSlc) {
      $debug and printf STDERR "%s. annotating query sequence according to BLAST match %s, HSP %d\n", &MySub,
        $$pHsp{MatchR}{id}||"''", $$pHsp{idnum};

      # in parallel: mask BLAST query sequence
      # this is needed for an eventual re-run of BLAST
      substr ($$pSeq{SeqPure}, $$pHsp{QueryBeg}-1, $$pHsp{QueryEnd}-$$pHsp{QueryBeg}+1)
        = $SmbMask x ($$pHsp{QueryEnd}-$$pHsp{QueryBeg}+1);
      $iMasked ++;

      # ... and enter annotations for matching entries
      # we regard possible discrepancies between original & pure sequence
      #   (BLASTed seq) via remapping in &SeqRangeGapped
      # the remapped range is read from keys '-1' (offset) and '1' (end)
      my $pHspRangeGapped = &SeqRangeGapped ($$pSeq{SeqPure}, $$pSeq{sequence}, [$$pHsp{QueryBeg}, $$pHsp{QueryEnd}]);
      push @{$$pSeq{annot}}, {
        type   => $AnnotLabel,
        orient => $$pHsp{orient},
        offset => $$pHspRangeGapped{'-1'},
        end    => $$pHspRangeGapped{'1'},
        text   => sprintf ("%s\ndescription=%s\nrange=%d..%d\nrel_identity=%.3f",
                  &{$SeqidFunc{$ProgOpt{-OutIdFmt}}}($$pHsp{MatchR}{id}),
                  $$pHsp{MatchR}{descr}, $$pHsp{MatchBeg}, $$pHsp{MatchEnd},
                  $$pHsp{RelId}),
        };

      # do we have to repeat BLAST/annotation cycle?
      # the total number of HSPs of the match (not only the selected ones) is
      #   compared to the effective hspmax limit
      # syntax 'int ($$pBlParse{QueryParamEff}{hspmax})' is crucial for
      #   success of comparison, don't know why.
      if (exists($$pBlParse{QueryParamEff}{hspmax}) and
        int(@{$$pBlParse{Match}{$$pHsp{MatchR}{id}}{HSP}}) >= int($$pBlParse{QueryParamEff}{hspmax})
        and !$bOverflow
      ) {
        $debug and printf STDERR "%s. first hspmax overflow (%d) with seq %s, match %s\n", &MySub,
          $$pBlParse{QueryParamEff}{hspmax}, $$pSeq{id}, $$pHsp{MatchR}{id}||"''";
        $bOverflow = 1;
      }
    }
    $debug and printf STDERR "%s. query sequence %s, Blast cycle %d\n", &MySub, $$pSeq{id}, $iBl;

    # debug after BLAST cycle
    if ($debug) {
      printf STDERR "%s. query sequence %s, BLAST cycle %d\n", &MySub, $$pSeq{id}, $iBl;
      printf STDERR "  additional masking in query sequence: %s\n", $iMasked ? 'YES':'NO';
      printf STDERR "  hspmax overflow (limit %d): %s\n",
        $$pBlParse{QueryParamEff}{hspmax}, $bOverflow ? 'YES':'NO';
      # per match ID: total number of HSPs in report / number of selected HSPs
      printf STDERR "  hit ID: HSPs total / HSPs considered\n%s", do {
        my %DebugHspGrp;
        map {
          $DebugHspGrp{$_->{MatchR}{id}}{total} = int @{$$pBlParse{Match}{$_->{MatchR}{id}}{HSP}};
          $DebugHspGrp{$_->{MatchR}{id}}{consid} ++;
        } @$pHspSlc;
        join ('', map { "    $_: $DebugHspGrp{$_}{total}/$DebugHspGrp{$_}{consid}\n" } keys %DebugHspGrp)
        };
    }

    # HSP overflow -> rewrite partially masked sequence and redo BLAST
    if ($bOverflow) {
      $debug and printf STDERR "%s. repeating BLAST due to overflow\n", &MySub;
      # repeat seq saving procedure which was already done in &SeqQueue
      &WriteFile ($$pSeq{SrcTmp},
        &SeqentryToFasta ($pSeq, -KeySeq=>'SeqPure', -pure=>undef));
      # repeat BLAST with higher hspmax value
      # start value is the effective limit, at least 20; the option is
      #   changed globally
      $ProgOpt{-Valhspmax} ||= &Max (20, $$pBlParse{QueryParamEff}{hspmax});
      $ProgOpt{-Valhspmax} *= 1.8;
      # release the old report before it is replaced by the new one
      &DataDecross ($pBlParse);
      $pBlParse = &GetBlastByseq($pSeq);
      redo;
    }
  }

  # final output, tidy up
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
  &DataDecross ($pBlParse);
}


# mask the query sequence ranges that are covered by selected HSPs
#
# INTERFACE
# - argument 1: reference to BLAST result data structure. The query sequence
#               entry is taken from its field QuerySeq.
#
# - global options:
#   ...         for BLAST parameter options see &SeqLab::Blast::BlastParamStr
#   ...         for selection and filter options see &GetHsps
#   -debug      [STD], leave temporary files
#   -OutSeq*    [STD]
#   -Valhspmax  is set (if still false) and multiplied by 1.8 each time BLAST
#               has to be repeated because of an hspmax overflow
#
# - global data:
#   $ProgParam{MaskChar}  masking character according to query seq type
#   $ProgParam{store}{queue}{SeqOut}
#                         queue object that receives the masked sequence entry
#
# DESCRIPTION
# - Every HSP delivered by &GetHsps is masked twice: in the pure sequence
#   (field SeqPure, the sequence that is written out for a BLAST re-run) and
#   in the original sequence (field sequence), where only letters inside the
#   remapped range are replaced.
# - If, for any HSP, the number of HSPs of its match has reached the effective
#   hspmax value, the partially masked pure sequence is written to the file
#   named by field SrcTmp, BLAST is repeated via &GetBlastByseq, and the whole
#   masking cycle starts over on the new result.
# - The function returns early, without queueing the sequence, if &GetHsps
#   yields a false value.
#
sub BlastSeqMask {
  my ($pBlParse) = @_;
  my $pSeq = $$pBlParse{QuerySeq};
  my $debug = $ProgOpt{-debug};
  my $SmbMask = $ProgParam{MaskChar};
  die sprintf ("%s. ERROR: undefined masking character\n", &MySub)
    unless $SmbMask;
  if ($debug) {
    printf STDERR "%s. masking query sequence %s\n", &MySub,
      $$pSeq{id}||"''";
  }

  ##############################################################################
  # BLAST analysis / masking cycle, repeated as long as hspmax overflows
  BlastCycle: {

    # HSPs that pass selection and filtering
    my $pHspSlc = &GetHsps($pBlParse) or return;

    # per-cycle bookkeeping
    my $CtMasked = 0;
    my $bOverflow = 0;

    foreach my $pHsp (@$pHspSlc) {
      if ($debug) {
        printf STDERR "%s. masking query sequence according to BLAST match %s, HSP %d\n", &MySub,
          $$pHsp{MatchR}{id}||"''", $$pHsp{idnum};
      }

      # mask the HSP range in the pure sequence (the one that was BLASTed);
      # this copy is what gets written out if BLAST has to be repeated
      my $PureOff = $$pHsp{QueryBeg} - 1;
      my $PureLen = $$pHsp{QueryEnd} - $$pHsp{QueryBeg} + 1;
      substr ($$pSeq{SeqPure}, $PureOff, $PureLen) = $SmbMask x $PureLen;
      $CtMasked ++;

      # mask the corresponding range of the original sequence. Positions are
      # remapped by &SeqRangeGapped to account for discrepancies between the
      # original and the pure sequence. Only letters are replaced.
      my $pRangeGapped = &SeqRangeGapped ($$pSeq{SeqPure}, $$pSeq{sequence},
        [ @$pHsp{'QueryBeg','QueryEnd'} ]);
      my $GapOff = $$pRangeGapped{'-1'} - 1;
      my $GapLen = $$pRangeGapped{'1'} - $$pRangeGapped{'-1'} + 1;
      substr ($$pSeq{sequence}, $GapOff, $GapLen) =~ s/[a-zA-Z]/$SmbMask/g;

      # has this match reached the hspmax limit, i.e. may HSPs be missing?
      # The explicit int() on the hspmax value is kept on purpose; it was found
      #   to be crucial for the comparison to succeed, the reason is unknown.
      if (exists $$pBlParse{QueryParamEff}{hspmax}) {
        my $CtHspOfMatch = int @{$$pBlParse{Match}{$$pHsp{MatchR}{id}}{HSP}};
        if ($CtHspOfMatch >= int($$pBlParse{QueryParamEff}{hspmax}) and !$bOverflow) {
          if ($debug) {
            printf STDERR "%s. hspmax overflow (%d) with sequence %s, match %s\n", &MySub,
              $$pBlParse{QueryParamEff}{hspmax}, $$pSeq{id}, $$pHsp{MatchR}{id}||"''";
          }
          $bOverflow = 1;
        }
      }
    }
    if ($debug) {
      printf STDERR "%s. query sequence %s, masked %d HSPs\n", &MySub,
        $$pSeq{id}, $CtMasked;
    }

    # no overflow -> masking is complete
    $bOverflow or last BlastCycle;

    # HSP overflow while matches had to be considered
    # -> rewrite partially masked sequence and redo BLAST
    if ($debug) {
      printf STDERR "%s. repeating BLAST due to overflow\n", &MySub;
    }
    # repeat seq saving procedure which was already done in &SeqQueue
    &WriteFile ($$pSeq{SrcTmp},
      &SeqentryToFasta ($pSeq, -KeySeq=>'SeqPure', -pure=>undef));
    # repeat BLAST with higher hspmax value
    # NOTE(review): the scaled value is not rounded and may become fractional -
    #   confirm that the BLAST parameter handling copes with that.
    unless ($ProgOpt{-Valhspmax}) {
      $ProgOpt{-Valhspmax} = &Max (20, $$pBlParse{QueryParamEff}{hspmax});
    }
    $ProgOpt{-Valhspmax} *= 1.8;
    &DataDecross ($pBlParse);
    $pBlParse = &GetBlastByseq($pSeq);
    redo BlastCycle;
  }

  # final output, tidy up
  $ProgParam{store}{queue}{SeqOut}->Push($pSeq);
  &DataDecross ($pBlParse);
}

# $Id: Blast.pl,v 1.49 2018/06/05 18:02:56 szafrans Exp $
