################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Sequence BLASTing
#
#  template for the &BlastImg function by
#  Alessandro Guffanti at TIGEM, Milano (Italy), Aug. 1998,
#    guffanti@tigem.it
#
#  copyright (c)
#    Fritz Lipmann Institute Jena, Genome Analysis Group, 2006-2007
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2003
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
# - supported BLAST program types and according BLAST database types:
#   NCBI 2.0 (=> NCBI 2.0)
#   WU 2.0 (=> NCBI 1.4)
#   see $_LibParam{'*Type*'} for type hierarchy definitions
#   see SeqHandle.pl -BlastDB for BLAST database types
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %LibGlob
#   %_LibParam
#
# - basics
#   $ENV{'...'}
#   %BlastDefault
#   $_LibParam{ProgSub2SeqType}
#
# - BLAST call
#   $_LibParam{SeqType2ProgSub}
#   $_LibParam{SeqType2DbSuffix}
#   $_LibParam{ProgType2DbType}
#   $_LibParam{DbType2ProgType}
#   $_LibParam{DbSuffixHrch}
#   $_LibParam{DbSuffix2SeqType}
#   $_LibParam{DbSuffix2DbType}
#   $_LibParam{Option14to20}
#   $_LibParam{ParamSet}
#   &BlastParamStr
#
# - BLAST report parsing and interpretation
#   &BlastSections
#   parsed BLAST report (data structure)
#   &BlastParse
#   &BlastStructBycall
#   &BlastImg
#
# - BLAST report conversion
#   &Blast2Html
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::Blast;

# includes
use strict; #use warnings;  # OK 20091027
use FileHandle;
use GD;  # this is not part of standard Perl distribution
  if ($GD::VERSION < 1.20 and ! $main::GlobStore{GdWarn}) {
    printf STDERR "WARNING: GD v%s doesn't support png\n", $GD::VERSION;
    $main::GlobStore{GdWarn} = 1;
  }
use MainLib::Data;
use MainLib::Graphics;
use MainLib::Path qw(%CorePath);
use MainLib::File qw(&WriteFile);
use MainLib::FileTmp qw(&PathUnique);
use MainLib::Misc qw(&MySub);
use Math::kCalc;
use SeqLab::SeqFormat qw (%reSeqid &SeqidLink);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  %BlastDefault &BlastParamStr
  &BlastSections &BlastParse &BlastStructBycall &BlastImg
  &Blast2Html
  );

# package-wide constants and variables
our %LibGlob;
my %_LibParam;


################################################################################
# basics
################################################################################


# environment variables
$ENV{BLASTDB} ||= '/usr/local/GDE/GDEHELP/BLAST';
unless (-d $ENV{BLASTDB}) { 
  printf STDERR "%s. WARNING: BLAST DB directory %s not valid\n", &MySub, $ENV{BLASTDB};
}
$ENV{NCBI} ||= '/gen/fly/biosw/sequin';

# miscellaneous default values
our %BlastDefault = (
  DbHome      => $ENV{BLASTDB} || '.',
  DbDictyAll  => 'dicty.all',
  DbDictyMask => 'dicty.mask',
  DbDictyProt => 'dicty.contig.prot',
  DbStd       => { nucleotide=>'dicty.all', protein=>'swiss' },
  ProgStdNt   => 'blastn2',
  ProgStdProt => 'blastp2',
  ProgCgi     => 'BlastDicty.cgi',
  );


# BLAST sub-program -> sequence types
# column labels: query type, db type, comparison type
$_LibParam{ProgSub2SeqType} = {
   blastn => [ 'nucleotide', 'nucleotide', 'nucleotide', ],
   blastp => [ 'protein',    'protein',    'protein',    ],
   blastx => [ 'nucleotide', 'protein',    'protein',    ],
  tblastn => [ 'protein',    'nucleotide', 'protein',    ],
  tblastx => [ 'nucleotide', 'nucleotide', 'protein',    ],
  };


################################################################################
# BLAST call
################################################################################


# sequence type -> BLAST sub-program
# - this is a 2D index
#   look-up order is: query type, db type
$_LibParam{SeqType2ProgSub} = {
  nucleotide => { nucleotide=>'blastn',  protein=>'blastx' },
  protein    => { nucleotide=>'tblastn', protein=>'blastp' },
  };

# sequence type -> diagnostic BLAST database suffixes
$_LibParam{SeqType2DbSuffix} = {
  protein    => [ qw(bsq xpt psq pal) ],
  nucleotide => [ qw(csq xnt nsq nal) ],
  };


# BLAST program type -> BLAST database type
# - currently, these are 1:1 equations
# - currently, this hash is completely out of use
$_LibParam{ProgType2DbType} = {
  'NCBI 2.0'   => 'NCBI 2.0',
  'WU 2.0'     => 'NCBI 1.4',
  'WU 2.0 lic' => 'XDF 1',
    # we don't really need to restrict use on XDF database type.
    # Type "NCBI 1.4" also works quite nice.
  };

# BLAST database type -> BLAST program type
# - currently, these are 1:1 equations
# - indexed values are used for look-up in $CorePath{call}{blast}
$_LibParam{DbType2ProgType} = {
  'XDF 1'    => 'WU 2.0 lic',
  'NCBI 1.4' => 'WU 2.0',
  'NCBI 2.0' => 'NCBI 2.0',
  };

# BLAST database hierarchy
# - by successive look-up of these database suffixes one will get the first
#   valid sequence type.
# - This implements the following hierarchy:
#   1st  DB type   NCBI 1.4, XDF 1, NCBI 2.0
#   2nd  seq type  nucleotide, protein
# - a partial version of this hierarchy is implemented in
#   $_LibParam{SeqType2DbSuffix}
$_LibParam{DbSuffixHrch} = [ qw(csq bsq xnt xpt nsq nal psq pal) ];

# BLAST database suffixes -> sequence types
# - by hierarchical look-up of database suffixes one will get the first
#   valid sequence type.
# - order hierarchy is:
#   1st  DB type   NCBI 1.4, XDF 1, NCBI 2.0
#   2nd  seq type  nucleotide, protein
$_LibParam{DbSuffix2SeqType} = {
  map { ($_->[0] => $_->[1]) }
  [ 'csq', 'nucleotide' ],
  [ 'bsq', 'protein' ],
  [ 'xnt', 'nucleotide' ],
  [ 'xpt', 'protein' ],
  [ 'nsq', 'nucleotide' ],
  [ 'nal', 'nucleotide' ],
  [ 'psq', 'protein' ],
  [ 'pal', 'protein' ],
  };

# BLAST database suffixes -> database types
# - by hierarchical look-up of database suffixes one will get the first
#   valid database type.
# - order hierarchy is:
#   1st  DB type   NCBI 1.4, XDF 1, NCBI 2.0
#   2nd  seq type  nucleotide, protein
$_LibParam{DbSuffix2DbType} = {
  map { ($_->[0] => $_->[1]) }
  [ 'csq', 'NCBI 1.4' ],
  [ 'bsq', 'NCBI 1.4' ],
  [ 'xnt', 'XDF 1' ],
  [ 'xpt', 'XDF 1' ],
  [ 'nsq', 'NCBI 2.0' ],
  [ 'nal', 'NCBI 2.0' ],
  [ 'psq', 'NCBI 2.0' ],
  [ 'pal', 'NCBI 2.0' ],
  };


# BLAST options: NCBI 1.4 / WU 2.0 -> NCBI 2.0
$_LibParam{Option14to20} = {
  map { ($_->[0] => $_->[1]) }
  [ 'B', '-b' ],
  [ 'E', '-e' ],
  [ 'gapW', undef ],
  [ 'M', '-r' ],
  [ 'matrix', '-M' ],
  [ 'N', '-q' ],
  [ 'Q', '-G' ],
  [ 'R', '-E' ],
  [ 'S', '-f' ],  # does it work in NCBI 2.0?
  [ 'V', '-v' ],
  [ 'W', undef ],  # no free choice in NCBI 2.0
  [ 'X', undef ],
  [ '-gapall', undef ],
  [ '-gi', '-I' ],
  [ '-nogap', '-g' ],
  };


# BLAST parameter sets
# sets are divided according to the two possible comparison types
$LibGlob{ParamSet} = {
  nucleotide => {
    CGI        => { B=> 120, E=>'1E-15',         M=> 6, N=>-12, W=>11, hspmax=> 5 },
    Default    => { B=> 120, E=>'1E-15',         M=> 5, N=> -8, W=>11, hspmax=> 8 },
    DictyMask  => { B=>  15,             S=>200, M=> 6, N=>-10, W=> 9, hspmax=>20 },
    EST        => { B=>2000,             S=>380, M=> 6, N=>-18, W=>18, hspmax=>10 },
    Hidden     => { B=>  50,             S=>140, M=> 5, N=> -8, W=> 6, hspmax=> 5 },
    Identity   => { B=>2000,             S=>500, M=> 5, N=>-15, W=>17, hspmax=> 5 },
    Proj       => { B=> 150,             S=>300, M=> 6, N=>-18, W=>14, hspmax=> 5 },
    ReptFPrint => { B=>  15,             S=>185, M=> 6, N=>-10, W=>10, hspmax=>20 },
    SimCluster => { B=> 500,             S=>220, M=> 6, N=>-18, W=>10, hspmax=> 5 },
    },
  protein => {
    CGI        => { B=> 120, E=>'1E-3',                         W=> 4, hspmax=> 5 },
    Default    => { B=> 120, E=>'1E-3',                         W=> 5, hspmax=> 8 },
    Identity   => { B=>  50, E=>'1E-15',                        W=> 8, hspmax=> 5 },
    Proj       => { B=> 150, E=>'1E-7',                         W=> 5, hspmax=> 5 },
    },
  };


# work out BLAST 2.0 call parameter string from BLAST parameter options
#
# INTERFACE
# - argument 1+: option hash
#
# - functional options:
#   -debug       [STD]
#
# - parameter options:
#   -db          use specified BLAST database (full path, without any suffixes).
#   -DbType      BLAST program type (case-insensitive). Any statement invokes
#                selection on databases of this type.
#   -param       specification of a parameter set to use for BLAST (see library
#                definitions above for available parameter sets). Default set
#                is 'Default'.
#   -program     use specified BLAST program (case-insensitive). This option
#                also influences which default parameter set to use.
#   -ProgType    BLAST program type (case-insensitive). Any statement invokes
#                selection on databases which are supported by this program
#                type.
#   -QueryType   query sequence type: nucleotide (default), protein
#                This information will be used to determine the appropriate
#                BLAST program if it's not specified.
#   -ValB        value for BLAST parameter B, cf. BLAST documentation.
#                Parameter V is automatically set equal to parameter B.
#   -Valcpus     explicitly set maximum number of CPUs used, cf. BLAST
#                documentation
#   -ValE        value for BLAST parameter E, cf. BLAST documentation
#   -Valgapall   BLAST option gapall, cf. BLAST documentation
#   -ValgapW     BLAST option gapW, cf. BLAST documentation
#   -ValgapX     BLAST option gapX, cf. BLAST documentation
#   -Valhspmax   value for BLAST parameter hspmax, cf. BLAST documentation
#   -ValM        value for BLAST parameter M, cf. BLAST documentation
#   -Valmatrix   value for BLAST parameter matrix, cf. BLAST documentation
#   -ValN        value for BLAST parameter N, cf. BLAST documentation
#   -Valnogap    BLAST option nogap: turn off gapped alignments
#   -ValQ        value for BLAST parameter Q, cf. BLAST documentation
#   -ValR        value for BLAST parameter R, cf. BLAST documentation
#   -ValS        value for BLAST parameter S, cf. BLAST documentation
#   -ValS2       value for BLAST parameter S2, cf. BLAST documentation
#   -ValW        value for BLAST parameter W, cf. BLAST documentation
#   -ValX        value for BLAST parameter X, cf. BLAST documentation
#
# - return val:  - array of:
#                  - BLAST program / database specification
#                  - BLAST parameter string
#                  - effective BLAST parameters (hash reference)
#                    in BLAST program-specific syntax
#                  - sequence types (hash reference)
#                - undef if an error occurred
#
# DESCRIPTION
# - the function tries to work out something useful from a minimal set of
#   explicitely supplied parameters, e.g.:
#   - query seq type, BLAST database
# - a default database will be looked up from the hash %BlastDefault:
#   $BlastDefault{DbHome}
#   $BlastDefault{DbStd}
# - NOTE: we have to do quite different things depending on the blast program
#   version that will be used (NCBI or WU). However, the procedure of how
#   to compile a complete parameter string will be the same.
#
# DEBUG, CHANGES, ADDITIONS
# - sort out parameters, that're not supported by the chosen BLAST program,
#     e.g.: Q, R
#   in case of BLAST-type dependent parameters, this is already established:
#     N is nucleotide-specific
#   we need a diagnosis of the chosen BLAST program.
#
sub BlastParamStr {
  my (%opt) = @_;

  # function parameters
  my $debug = $opt{-debug};
  $opt{-DbType} and $opt{-DbType} =~ s/^(\d)/NCBI $1/;
  my %prog;
  $prog{type} = uc($opt{-ProgType}) || $_LibParam{DbType2ProgType}{uc($opt{-DbType})};
  if ($debug) {
    printf STDERR "%s. format type statements:%s\n", &MySub,
      (grep { $_ } map { $opt{$_} } qw(-DbType -ProgramType -QueryType)) ? '':' NONE';
    foreach my $ItParam (grep { $opt{$_} } qw(-DbType -ProgramType -QueryType)) {
      printf STDERR "  %s => %s\n", $ItParam, $opt{$ItParam};
    }
    printf STDERR "%s. program statement: %s\n", &MySub, $opt{-program} || 'NONE';
  }

  ##############################################################################
  # BLAST program, query sequence, BLAST database
  #
  # type    BLAST program type, cf. DESCRIPTION
  # name    explicit specification
  # basic   BLAST sub-program
  # call    BLAST program path that'll be called
  my (%SeqType,$PathDb,$sProgAndDb);

  # BLAST program stated
  if ($prog{name} = $opt{-program}) {

    # enter specification of BLAST program
    # seq types expected for: query, database, comparison
    $prog{name} =~ m/t?blast[npx]/i;
    $prog{basic} = lc ($&);
    if (! $prog{basic} or $prog{basic} eq 'tblastp') {
      printf STDERR "%s. ERROR: unknown BLAST program %s (evaluated to %s)\n", &MySub,
        $prog{name}||"''", $prog{basic}||"''";
      return undef;
    }
    ($SeqType{ExpectQuery}, $SeqType{ExpectDb}, $SeqType{comp}) =
      @{ $_LibParam{ProgSub2SeqType}{$prog{basic}} };

    # check query sequence type
    if ($opt{-QueryType} and $opt{-QueryType} ne $SeqType{ExpectQuery}) {
      printf STDERR "%s. ERROR: query sequence type %s"
        . " doesn't fit to BLAST program %s (expected type: %s)\n", &MySub,
        $opt{-QueryType}, $prog{basic}, $SeqType{ExpectQuery};
      return undef;
    }

    # BLAST database
    unless ($PathDb = $opt{-db}) {
      $debug and printf STDERR "%s. using default %s database %s\n", &MySub,
        $SeqType{ExpectDb}, $PathDb||"''";
    }
    $PathDb = ($PathDb =~ m/(^|[^\\])\//) ? $PathDb : $BlastDefault{DbHome} .'/'. $PathDb;
    unless (@_ = grep { ! $prog{type} or $_LibParam{DbType2ProgType}{$_LibParam{DbSuffix2DbType}{$_}} eq $prog{type} }
            grep { -r "$PathDb.$_" }
            @{$_LibParam{SeqType2DbSuffix}{$SeqType{ExpectDb}}}) {
      printf STDERR "%s. ERROR: unable to find BLAST %s database %s\n", &MySub, $SeqType{ExpectDb}, $PathDb||"''";
      return undef;
    }
    $SeqType{db} = $SeqType{ExpectDb};

    # BLAST program type from first matching suffix
    $prog{type} ||= $_LibParam{DbType2ProgType}{$_LibParam{DbSuffix2DbType}{$_[0]}};
  }

  # derive program from DB and query sequence
  else {

    # check query sequence type
    unless ($SeqType{query} = $opt{-QueryType}) {
      $debug and printf STDERR "%s. sequence type assumed to be nucleotide\n", &MySub;
      $SeqType{query} = 'nucleotide';
    }

    # BLAST database
    unless ($PathDb = $opt{-db}) {
      $debug and printf STDERR "%s. using default (nucleotide) database %s\n", &MySub, $PathDb||"''";
      $PathDb = $BlastDefault{DbHome} .'/'. $BlastDefault{DbStd}{nucleotide};
    }
    unless (@_ = grep { ! $prog{type} or $_LibParam{DbType2ProgType}{$_LibParam{DbSuffix2DbType}{$_}} eq $prog{type} }
            grep { -r "$PathDb.$_" }
            @{$_LibParam{DbSuffixHrch}}) {
      printf STDERR "%s. ERROR: unable to find any BLAST database %s\n", &MySub, $PathDb||"''";
      return undef;
    }
    $SeqType{db} = $_LibParam{DbSuffix2SeqType}{$_[0]};

    # BLAST program type from first matching suffix
    $prog{type} ||= $_LibParam{DbType2ProgType}{$_LibParam{DbSuffix2DbType}{$_[0]}};

    # BLAST program
    $prog{basic} = $prog{name} =
      $_LibParam{SeqType2ProgSub}{$SeqType{query}}{$SeqType{db}};
    unless ($prog{name}) {
      printf STDERR "%s. code ERROR: unable to determine BLAST program, seq type %s, db type %s\n", &MySub,
        $SeqType{query}, $SeqType{db};
      &DataPrint ($_LibParam{SeqType2ProgSub}, -handle=>\*STDERR);
      return undef;
    }
    $SeqType{comp} = (sort { $b cmp $a } $SeqType{query}, $SeqType{db})[0];
  }

  # meanwhile debug
  if ($debug) {
    printf STDERR "%s. type statements/diagnoses:\n", &MySub;
    printf STDERR "  query seq: %s\n", $SeqType{query}||"''";
    printf STDERR "  DB seq: %s\n", $SeqType{db}||"''";
    printf STDERR "  comp. seq: %s\n", $SeqType{comp}||"''";
    printf STDERR "  program: %s %s\n", $prog{basic}||"''", $prog{type}||"''";
    printf STDERR "%s. valid DB suffices: %s\n", &MySub, join ' ', @_;
  }

  # BLAST call, including database
  unless ($prog{call} = $CorePath{call}{blast}{$prog{type}}{lc $prog{name}}) {
    printf STDERR "%s. ERROR: unknown BLAST program %s (type %s)\n", &MySub, $prog{name}||"''", $prog{type}||"''";
    return undef;
  }
  if ($prog{type} eq 'NCBI 2.0') {
    $PathDb =~ s/^/-d /; 
    $PathDb .= ' -i'; 
  }
  $sProgAndDb = "$prog{call} $PathDb";

  ##############################################################################
  # BLAST parameters - value-dependent
  my %param;

  # BLAST parameter set
  if ($opt{-param}) {
    unless (defined $LibGlob{ParamSet}{$SeqType{comp}}{$opt{-param}}) {
      printf STDERR "%s. ERROR: BLAST parameter set %s not defined for %s type comparison\n", &MySub,
        $opt{-param}, $SeqType{comp};
      return undef;
    }
    %param = %{$LibGlob{ParamSet}{$SeqType{comp}}{$opt{-param}}};
  } else {
    %param = %{$LibGlob{ParamSet}{$SeqType{comp}}{Default}};
  }
  if ($opt{-ValS}) { delete $param{E} }
  if ($opt{-ValE}) { delete $param{S} }

  # add/overwrite BLAST parameters from settings via function options
  foreach (qw(B cpus E E2 hspmax M matrix N Q R S S2 T W X)) {
    if (exists($opt{'-Val'.$_}) and $opt{'-Val'.$_}) {
      $param{$_} = $opt{'-Val'.$_};
    }
  }

  # ergonomic defaults for E2, S2
  if (exists($param{E}) and $param{E}) {
    $param{E2} ||= &Min (10, $param{E}*100);
  }
  if (exists($param{S})) {
    $param{S2} ||= $param{S} * 0.78;
  }

  # for comparison type protein, parameter N is not allowed
  if ($SeqType{comp} eq 'protein') { delete $param{N} }

  # work out cross-wise depending BLAST parameters
  $param{V} = $param{B};

  # integer value BLAST parameters
  foreach (grep { exists($param{$_}) and $param{$_} }
    qw(B cpus gapW gapX hspmax M N Q R S S2 T V W X)) {
    $param{$_} = int ($param{$_});
  }

  ##############################################################################
  # BLAST parameters - boolean options

  # boolean Options
  foreach (qw(gapall nogap)) {
    if ($opt{'-Val'.$_}) { $param{'-'.$_}=1 }
  }

  # direct match sorting
  $param{$param{S}?'-sort_by_highscore':'-sort_by_pvalue'} = 1;

  # -gi always
  $param{-gi} = 1;

  ##############################################################################
  # BLAST parameter string
  my $sParam;

  # BLAST NCBI 2.0 parameter syntax
  # - translate parameter string
  # - prepare parameter string
  if ($prog{type} eq 'NCBI 2.0') {
    foreach (keys %param) {
      unless ($param{$_} and $_LibParam{Option14to20}{$_}) { next }
      if (m/^-nogap$/) {
        $param{$_} = 'F';
      }
      if (m/^-/ and $_ !~ m/-nogap/) {  # boolean switch
        $sParam .= " $_LibParam{Option14to20}{$_} T";
      } else {
        $sParam .= " $_LibParam{Option14to20}{$_} $param{$_}";
      }
    }
  }

  # BLAST NCBI 1.4 / BLAST WU 2.0 parameter syntax
  # - prepare parameter string
  elsif ($prog{type} eq 'WU 2.0' or 1) {
    foreach (keys %param) {
      $param{$_} or next;
      if (m/^-/) {  # boolean switch
        $sParam .= " $_";
      } else {
        $sParam .= " $_=$param{$_}";
      }
    }
  }

  # return
  if ($debug) {
    printf STDERR "%s. final strings:\n", &MySub;
    printf STDERR "  program & db: %s\n", $sProgAndDb;
    printf STDERR "  parameters: %s\n", $sParam;
  }
  return ($sProgAndDb, $sParam, \%param, \%SeqType);
}


################################################################################
# BLAST report parsing and interpretation
################################################################################


# extract three header/footer sections from a BLAST report (plain or HTML)
#
# INTERFACE
# - argument 1: filename for BLAST report (input file)
# - return val: array of BLAST report sections: (query, program, database)
#
# DEBUG, CHANGES, ADDITIONS
# - also return report sub-sections: query name, warnings etc.
# - return hash data structure
#
sub BlastSections {
  my $StartQuery = 'Query=';
  my $PathIn = shift;

  # open BLAST report for input
  my $hIn = FileHandle->new($PathIn);
  unless ($hIn) {
    die sprintf "%s. ERROR: unable to open BLAST report file %s for input\n", &MySub, $PathIn||"''";
  }

  # read report header and split to sections
  # read until HSP index
  my $buffer;
  while (<$hIn>) {
    if (m/[\. ]+done$/) { last }
    $buffer .= $_;
  }
  my ($SectQuery,$SectProg,$SectDb);
  ($SectProg,$buffer)   = split (/\n\n$StartQuery/o, $buffer);
  ($SectQuery,$SectDb) = split (/\n\n/, $StartQuery . $buffer);

  # return
  return ($SectQuery,$SectProg,$SectDb);
}


# parsed BLAST report (data structure)
#
# DESCRIPTION
#
# %Report
#   root data structure of return value of &BlastParse (%BlMain during
#   processing in &BlastParse)
#   Hash containing framing info about the query, as found in the BLAST report.
#
#   SrcPath       file path of BLAST report
#   program       BLAST program
#   CompType      sequence type of BLAST comparison, either 'nucleotide' or
#                 'protein'
#   DbLabel       label of BLAST database, may be equal to DbPath
#   DbPath        file path of BLAST database
#   DbName        filename of BLAST database
#   fatal         array of message paragraphs for fatal errors
#   FlagRepeat    a random fatal error occured (some versions of WU-BLAST 2.0).
#                 Just repeat the BLAST process.
#   FlagCritical  a random critical error occured. No matches, no message
#                 '*** NONE ***', and no fatal errors.
#   FlagNoHit     no hit at all in the BLAST report
#   warn          array of message paragraphs for warnings
#   QuerySeq      attributes of query sequence
#     id          identifier
#     descr       description, as parsed from the reported fastA header
#     length      total sequence length
#   Match         reference to hash of matches (key is match ID), see %match
#   MatchNum      number of reported matches. Note, that this may a limit set
#                 by the BLAST parameter B.
#   HighestScore  highest reached match score
#
# %Match
#   anchored as %{$report{match}{$id}} ($BlMatch{$id} during processing in
#   &BlastParse)
#   Representation of matching database entry with following hash keys:
#
# ( complex       highest complexity value of all HSPs
#   descr         description of the match entry
#   expect        lowest expectancy value of all HSPs.
#   header        complete header line of the entry. The string contains
#                 a '>' character at it's beginning.
#   id            redundant info (cf. anchoring of %Match), but the %Match hash
#                 may be handled separate from the %Report hash
#   length        total length of the matching sequence from the database
#                 (don't confuse with the length of a HSP).
#   HSP           array of %HSP hashes for all HSPs belonging to that
#                 matching entry (see description below). HSPs are
#                 sorted by score value (highest first).
#   MaxHspLen     maximum length of all HSPs (measured on query sequence)
#   QueryBeg      query sequence start of the alignment range
#   QueryEnd      query sequence end of the alignment range
#   QueryLen      length of query sequence in the alignment range
#   RelId         highest rel. identity value of all HSPs
#   RelPos        highest rel. positives value of all HSPs
#   score         highest score value of all HSPs
#
# %HSP
#   anchored as %{$report{match}{$id}{HSP}[$num]}
#   Representation of an HSP with following hash keys:
#
# ( align         reference to alignment data structure, cmp. SeqAlign::Align.
#                 This structure is not built by default in &BlastParse, cmp.
#                 function options. Substructure:
#                   $pAln->[0]{sequence}
#                   $pAln->[1]{sequence}
#                   $pAln->[2]{sequence}
# ( complex       alignment complexity value
#   expect        expect value for the HSP
#   MatchR        back-reference to match data node, see %Match
# - MatchSeg      HSP segment positions for matching sequence. This is a
#                 temporary field throughout &BlastParse which finally yields
#                 the values for keys 'MatchBeg' and 'MatchEnd'.
#   MatchBeg      matching sequence start of the HSP
#   MatchEnd      matching sequence end of the HSP
#   MatchLen      length of matching sequence in the HSP. Note that the
#                 sequence type of query and match may be different so that the
#                 length is different by a factor of 3!
#   MatchFr       translation frame of matching sequence (for TBLAST only).
#                 Note the -tblastn option in &BlastParse!
#   orient        relative orientation of sequence pair in HSP
# - QuerySeg      HSP segment positions for query sequence. This is a
#                 temporary field throughout &BlastParse which finally yields
#                 the value for keys 'QueryBeg' and 'QueryEnd'.
#   QueryBeg      query sequence start of the HSP
#   QueryEnd      query sequence end of the HSP
#   QueryLen      length of query sequence in the HSP. Note that the
#                 sequence type of query and match may be different
#                 so that the length is different by a factor of 3!
#   QueryFr       translation frame of query sequence (for TBLAST only)
#                 Note the -tblastn option in &BlastParse!
#   RelId         relative identity in HSP (0 < RelId < 1)
#   RelPos        relative positives in HSP (0 < RelPos < 1). This is
#                 informative for protein BLAST only but given also for
#                 nucleotide BLAST.
#   score         HSP score
#
# example: $$pReport{Match}{'P03394'}{HSP}[0]{orient}
#            means
#          relative strandedness in highest scoring HSP for match between query
#          sequence and database sequence 'P03394'.
#


# parse BLAST to data structure
#
# INTERFACE
# - argument 1: source:
#               - path of source file (BLAST report)
#               - reference to file handle (typeglob reference)
#
# - options:
#   -align      parse alignments
#               0  no alignment parsing
#               1  parse alignment for the first (best) HSP only
#                  this may effectively evaluate to mode 2 if option -complex
#                  is set additionally
#               2  parse alignment for all HSPs
#   -complex    calculate HSP complexity
#   -debug      [STD]
#
# - return val: - reference to data structure
#               - undef if an error occurred
#
# DESCRIPTION
# - for BLAST report data structure see descriptions above.
#
sub BlastParse {
  my ($ArgIn,%opt) = @_;
  my $debug = $opt{-debug};

  my $hInBl;
  my %BlMain;
  if (ref($ArgIn) =~ m/\b(FileHandle|GLOB)/) {
    $hInBl = $ArgIn;
    $debug and printf STDERR "%s. opened BLAST report handle %s\n", &MySub, ref($ArgIn);
    $BlMain{SrcPath} = undef;
  } else {
    unless ($hInBl = FileHandle->new($ArgIn)) {
      die sprintf "%s. ERROR: unable to open input file %s\n", &MySub, $ArgIn||"''";
    }
    $debug and printf STDERR "%s. opened BLAST report file %s\n", &MySub, $ArgIn||"''";
    $BlMain{SrcPath} = $ArgIn;
  }

  ##############################################################################
  # pre-work, loop control
  my (%BlMatch,$CurrSeqid,$pHsp,$orient,$CtHsp,$CtHspInblock);

  # very first header line
  my $line = <$hInBl>;
  if ($line =~ m/^\w*BLAST\w*/) {
    $BlMain{program} = lc $&;
    $BlMain{CompType} = $_LibParam{ProgSub2SeqType}{$BlMain{program}}[2];
    $BlMain{CompAlphSize} = ($BlMain{CompType} eq 'protein') ? 20 : 4;
  } else {
    # probably, no (more) report
    $debug and printf STDERR "%s. failed to grab BLAST program name, line %d:\n%s", &MySub,
      $., $line;
    return undef;
  }

  # line loop
  while (defined($line=<$hInBl>)) {
    $debug and printf STDERR "%s. line %d\n", &MySub, $.;

  ##############################################################################
  # report header section

    # pick up query seq path/name and length
    if ($line =~ m/^\s*Query=\s*(.*?)(?:\s+(.+?))?\s*$/) {
      chomp $line;
      $BlMain{QuerySeq} = { id=>$1, descr=>$2 };
      while (defined($line=<$hInBl>) and $line!~m/^\s*$/ and $line!~m/\s+letters[;\)]/ and $line!~m/^Length\s*=\s*(\d+)$/) {
        chomp $line;
        $line =~ s/^\s*/ /;
        $BlMain{QuerySeq}{descr} .= $line;
      }
      if ($line =~ m/^\s+\(([0-9,]+)\s+letters\)/) {
        $BlMain{QuerySeq}{length} = $1;
        $BlMain{QuerySeq}{length} =~ s/,//g;
      } elsif ($line =~ m/^Length\s*=\s*(\d+)$/) {
        $BlMain{QuerySeq}{length} = $1;
      }
    }

    # pick up database path/name
    elsif ($line =~ m/^Database:\s+(.*)$/) {
      $BlMain{DbLabel} = $1;
      $debug and printf STDERR "%s. got database label: %s\n", &MySub,
        $BlMain{DbLabel}||"''";
    }

  ##############################################################################
  # match header

    # first line of BLAST match header:
    # - current sequence ID
    #   replace complex GenBank-fashion sequence ID by Acc.No.
    # - full header
    elsif ($line =~ m/^>(\S+)/) {
      $CurrSeqid = $1;
      $CtHsp = -1;
      $CtHspInblock = -1;
      my $header = $line;
      if (exists($BlMatch{$CurrSeqid}) and exists($BlMatch{$CurrSeqid}{HSP})) {
        printf STDERR "%s. WARNING: double HSP report segment for sequence $CurrSeqid\n", &MySub;
      }
      $debug and printf STDERR "%s. entered HSP report for sequence $CurrSeqid\n", &MySub;

      # parse rest of header of BLAST match entry
      while (defined ($line=<$hInBl>)) {

        # match to 'Length' entry in header of alignment pair segment
        # => end of match header
        # -  final work for header: split to descr, ...
        if ($line =~ m/^\s*Length\s*=\s*([\d,]+)$/) {
          $BlMatch{$CurrSeqid}{id} = $CurrSeqid;
          $BlMatch{$CurrSeqid}{length} = $1;
          $BlMatch{$CurrSeqid}{length} =~ tr/,//d;
          $header =~ s/[\n\r ]+/ /g;
          my $descr = substr ($header, index ($header, ' ') + 1, length $header);
          $BlMatch{$CurrSeqid}{header} = $header;
          $BlMatch{$CurrSeqid}{descr} = $descr;
          if ($debug) {
            printf STDERR "%s. header info for new match:\n", &MySub;
            printf STDERR "  ID: %s\n", $CurrSeqid;
            printf STDERR "  length: %d\n", $BlMatch{$CurrSeqid}{length};
            printf STDERR "  description: %s\n", $BlMatch{$CurrSeqid}{descr};
          }
          last;
        }

        # continued description
        else {
          $header .= $line;
        }
      }
    }

  ##############################################################################
  # HSP header

    # HSP header, first line: score sum
    elsif ($line =~ m/^\s*Score = +(\d+) \([\d\.]+ bits\), +Expect = ([\d\.eE+-]+)/ or  # NCBI 1.4?, WU 2.0
           $line =~ m/^\s*Score = +[\d\.]+ bits \((\d+)\), +Expect = ([\d\.eE+-]+)/     # NCBI 2.0
    ) {
      $CtHsp ++;
      $CtHspInblock ++;
      $pHsp = $BlMatch{$CurrSeqid}{HSP}[$CtHsp] ||= {};
      $$pHsp{score}  = $1;
      $$pHsp{expect} = $2;
      $$pHsp{orient} = $orient || 1;
        # this takes effect only for WU-BLAST, possibly overwritten by a value
        #   derived from frame values
        # it definitely gets overwritten for NCBI BLAST
    }

    # HSP header, second line: identities, positives, frame(s)
    elsif ($line =~ m|^\s*Identities = (\d+)/(\d+) \(.*?\)(, Positives = (\d+)/(\d+) \(.*?\))?(, Frame = ([-+]\d))?( / ([-+]\d))?|) {
      $$pHsp{RelId} = $1 / $2;
      $$pHsp{RelPos} = length($4||'') ? $4/$5 : 0;
      if (length($7||'')) { $$pHsp{QueryFr} = $7 };
      if (length($9||'')) { $$pHsp{MatchFr} = $9 };
      # correctly assign query for TBLASTN
      if ($BlMain{program} =~ m/tblastn/i) {
        $$pHsp{MatchFr} = $$pHsp{QueryFr};
        delete $$pHsp{QueryFr};
      }
      # override error in WU TBLASTX
      if   (exists($$pHsp{MatchFr}) and $$pHsp{MatchFr}
        and exists($$pHsp{QueryFr}) and $$pHsp{QueryFr}
      ) {
        $$pHsp{orient}
          = (&Sign($$pHsp{MatchFr}) == &Sign($$pHsp{QueryFr})) ? 1 : -1;
      }
      $debug and printf STDERR "%s. entered HSP no. %d, score %d, expect %s, rel. identity %f\n", &MySub,
        $CtHsp, $$pHsp{score}, $$pHsp{expect}||"''", $$pHsp{RelId};
    }

    # HSP header, third line: strandedness (NCBI)
    #  or
    # strandedness section separator (WU)
    elsif ($line =~ m/^\s+(Plus|Minus) Strand HSPs:(\n|$)/ or
           $line =~ m/^\s+Strand = Plus \/ ((Plus|Minus))(\n|$)/) {
      $orient = ($1 eq 'Plus') ? 1 : -1;
      $CtHspInblock = (length($2)>2) ? ++$CtHspInblock : -1;
      if (length($2)>2) { $$pHsp{orient} = $orient }
      $debug and printf STDERR "%s. strandedness set/switched to %s\n", &MySub,
        ($orient>0)?'plus':'minus';
    }

  ##############################################################################
  # HSP alignment

    # match to each first line of ALIGNMENT triple-line
    elsif ($line =~ m/^\s*Query:? +(\d+) +([a-zA-Z\-\*]+) +(\d+)/) {

      # add HSP segment positions (query sequence) to array for current HSP
      push @{$$pHsp{QuerySeg}}, $1, $3;

      # enter alignment portion
      # ($CtHspInblock==0) signals first HSP of orientation block, possibly
      #   overall highest scoring
      my $bEnterAlign = (($opt{-align} and ($opt{-align}>1 or !$CtHspInblock)) or $opt{-complex});
      $debug and printf STDERR "%s. %s alignment segment,"
        . " query seq range $1..$3\n", &MySub, $bEnterAlign?'entered':'parsed';
      if ($bEnterAlign) {
        $$pHsp{align}[0]{sequence} .= $2;
        $line = <$hInBl>;
        $$pHsp{align}[1]{sequence} .= substr ($line, -length($2)-1, length($2));
      }
    }

    # match to each last line of ALIGNMENT triple-line
    elsif ($line =~ m/^\s*Sbjct:? +(\d+) +([a-zA-Z\-\*]+) +(\d+)/) {

      # add HSP segment positions (matching sequence) to array of current HSP
      push @{$$pHsp{MatchSeg}}, $1, $3;

      # enter alignment portion
      # ($CtHspInblock==0) signals first HSP of orientation block, possibly
      #   overall highest scoring
      if (($opt{-align} and ($opt{-align}>1 or !$CtHspInblock)) or $opt{-complex}) {
        $$pHsp{align}[2]{sequence} .= $2;
      }
    }

  ##############################################################################
  # other stuff to parse beside matches

    # warnings
    elsif ($line =~ m/^WARNING:  /) {
      my $buffer = $line;
      while (defined ($line=<$hInBl>) and $line!~m/^$/ and $line!~m/^Searching/) {
        $buffer .= $line;
      }
      push @{$BlMain{warn}}, $buffer;
    }

    # fatal errors
    elsif ($line =~ m/^FATAL:  /) {
      my $buffer = $line;
      while (defined($line=<$hInBl>) and $line!~m/^$/ and $line!~m/^Searching/) {
        $buffer .= $line;
      }
      push @{$BlMain{fatal}}, $buffer;
      if ($buffer =~ m/^FATAL: +.*(ExpandX|WordFinderSearch)/) {
        $BlMain{FlagRepeat} = 1;
      }
    }

    # no hits
    elsif ($line =~ m/^\s+[*]{3} NONE [*]{3}/) {
      $BlMain{FlagNoHit} = 1;
    }

  ##############################################################################
  # trailing statistics statements, end of report

    # warnings
    elsif ($line =~ m/^Statistics:/) {
      $debug and printf STDERR "%s. reached statistics block, line %s\n", &MySub, $.;
      scalar <$hInBl>;
      while (defined($line=<$hInBl>) and $line!~m/^(\x0c|\/\/|\n)/) {
        
        # database specification
        if ($line =~ m/^  Database:\s+(\S+)/) {
          $BlMain{DbPath} = $1;
          $debug and printf STDERR "%s. got database path: %s\n", &MySub,
            $BlMain{DbPath}||"''";
          if ($BlMain{DbPath} =~ m|^.*/(.*)$|) {
            $BlMain{DbName} = $1;
          } else {
            $BlMain{DbName} = $BlMain{DbPath};
          }
        }
      }
      if (defined($line)) { redo } else { last }
    }

    # end of report
    elsif ($line =~ m/^(\/\/|\x0c)/) {
      $debug and printf STDERR "%s. found end of BLAST report, line %s\n", &MySub, $.;
      last;
    }

  } # end while for line loop

  ##############################################################################
  # final work on data structure

  # critical error
  if (!keys(%BlMatch) and !$BlMain{FlagNoHit} and !$BlMain{fatal}) {
    $BlMain{FlagCritical} = 1;
  }

  # loop over all entries again
  foreach $CurrSeqid (keys %BlMatch) {
    $debug and printf STDERR "%s. refinement, match $CurrSeqid\n", &MySub;

    # loop over HSPs
    foreach $pHsp (@{$BlMatch{$CurrSeqid}{HSP}}) {

      # derive HSP offsets and ends
      $$pHsp{MatchR} = $BlMatch{$CurrSeqid};
      $$pHsp{QueryBeg} = &Min (@{$$pHsp{QuerySeg}});
      $$pHsp{QueryEnd} = &Max (@{$$pHsp{QuerySeg}});
      $$pHsp{QueryLen} = $$pHsp{QueryEnd} - $$pHsp{QueryBeg} + 1;
      $$pHsp{MatchBeg} = &Min (@{$$pHsp{MatchSeg}});
      $$pHsp{MatchEnd} = &Max (@{$$pHsp{MatchSeg}});
      $$pHsp{MatchLen} = $$pHsp{MatchEnd} - $$pHsp{MatchBeg} + 1;
      delete $$pHsp{MatchSeg};
      delete $$pHsp{QuerySeg};

      # letter complexity of alignment
      if ($opt{-complex}) {
        my $HspMatchlett = ($$pHsp{align}[1]{sequence} =~ m/[a-zA-Z]/) ?
          $$pHsp{align}[1]{sequence} : uc($$pHsp{align}[0]{sequence});
        $HspMatchlett =~ s/\W+//g;
        $HspMatchlett or next;
        my %HspMatchlettIndex;
        map { $HspMatchlettIndex{$_}++ } split(//,$HspMatchlett);
        $$pHsp{complex} = - &Sum (map {
          $_ / length($HspMatchlett) * log($_/length($HspMatchlett))
          } values %HspMatchlettIndex);
        # / $BlMain{CompAlphSize};
        $opt{-align} or delete $$pHsp{align};
      }
    }

    # sort HSPs of each enty by score descending
    @{$BlMatch{$CurrSeqid}{HSP}} = sort {
      $b->{score} <=> $a->{score} or
      ($a->{QueryLen}+$a->{MatchLen}) <=> ($b->{QueryLen}+$b->{MatchLen}) or
      $a cmp $b } @{$BlMatch{$CurrSeqid}{HSP}};

    # match start, end, length
    $BlMatch{$CurrSeqid}{QueryBeg} = &Min (map { $_->{QueryBeg} } @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{QueryEnd} = &Max (map { $_->{QueryEnd} } @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{QueryLen} = $BlMatch{$CurrSeqid}{QueryEnd} - $BlMatch{$CurrSeqid}{QueryBeg} + 1;

    # match highest score, highest expectancy, highest identity
    $opt{-complex} and
    $BlMatch{$CurrSeqid}{complex}   = &Max (map { $_->{complex} }  @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{expect}    = &Min (map { $_->{expect} }   @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{RelId}     = &Max (map { $_->{RelId} }    @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{RelPos}    = &Max (map { $_->{RelPos} }   @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{MaxHspLen} = &Max (map { $_->{QueryLen} } @{$BlMatch{$CurrSeqid}{HSP}});
    $BlMatch{$CurrSeqid}{score}     = &Max (map { $_->{score} }    @{$BlMatch{$CurrSeqid}{HSP}});
  }

  # highest score reached
  $BlMain{HighestScore} = &Max (map { $_->{score} } values %BlMatch);

  # combine data structures
  $BlMain{MatchNum} = int keys %BlMatch;
  $BlMain{Match} = \%BlMatch;

  # report parsed structure
  # - &DataPrint(\%BlMain) could pose problems in readability - we have cross-
  #   references inside the data structure! You can try program
  #   Blast.pl -data <seqFile> instead.
  if ($_ = (grep { !exists($_->{id}) } values %{$BlMain{Match}})[0]) {
    printf STDERR "%s. WARNING: NULL entry in list of matches\n", &MySub;
    printf STDERR "%s. warnings: %d\n", &MySub, int @{$BlMain{warn}};
    printf STDERR "%s. fatal: %d\n", &MySub, int @{$BlMain{fatal}};
  }

  # exit SUB
  return \%BlMain;
}


# framework to do BLAST call and parse BLAST result structure
#
# INTERFACE
# - argument 1: BLAST call string
#
# - options:
#   -debug      [STD]
#   -WarnFatal  report remaining fatal errors to STDERR
#   ...         and all of &BlastParse
#
# - return val: - reference to BLAST result structure
#               - undef if an error occurs
#                 - unable to start BLAST process
#                 - undef from &BlastParse
#
# DESCRIPTION
# - BLAST's STDERR is redirected to /dev/null. Errors are determined from the
#   result output (STDOUT, read by &BlastParse).
#
sub BlastStructBycall {
  my ($CallBlast, %opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : 0;

  my ($hInBl,$pBlParse,$bCritical);
  { # redo block:
    # invoke BLAST process

    # get input handle for BLAST process
    $debug and printf STDERR "%s. calling command '$CallBlast'\n", &MySub;
    unless ($hInBl = FileHandle->new("$CallBlast 2>/dev/null |")) {
      printf STDERR "%s. ERROR: unable to start BLAST process\n  '$CallBlast'\n", &MySub;
      return undef;
    }

    # parse BLAST report
    $pBlParse = &BlastParse ($hInBl, %opt);
    unless ($pBlParse) {
      $debug and printf STDERR "%s. ERROR: unable to parse BLAST report\n", &MySub;
      return undef;
    }

    # handle fatal errors
    if (exists $$pBlParse{fatal} and @{$$pBlParse{fatal}}) {

      # stupid random error
      if ($$pBlParse{FlagRepeat}) {
        $debug and printf STDERR "%s. BLAST random runtime error, query ID %s, repeating BLAST search\n", &MySub,
          $$pBlParse{QuerySeq}{id}||"''";
        redo;
      }

      # defined fatal error
      if ($$pBlParse{fatal} and ($debug or $opt{-WarnFatal})) {
        printf STDERR "%s. fatal BLAST error, query ID %s\n", &MySub,
          $$pBlParse{QuerySeq}{id}||"''";
        print  STDERR (join ('', @{$$pBlParse{fatal}}));
      }
    }

    # stupid random drop-down
    elsif ($$pBlParse{FlagCritical} and !$bCritical) {
      $debug and printf STDERR "%s. BLAST random drop-down error, query ID %s, repeating BLAST search\n", &MySub,
        $$pBlParse{QuerySeq}{id}||"''";
      $bCritical = 1;
      redo;
    }

    # no error at all
    elsif ($debug) {
      printf STDERR "%s. query ID %s: found %d primary matching sequences in BLAST report\n", &MySub,
        int(keys %{$$pBlParse{Match}}), $$pBlParse{QuerySeq}{id}||"''";
    }
  } # BLAST block

  # return result
  return $pBlParse;
}


# draws a schematic png image for a given BLAST report
#
# INTERFACE
# - argument 1: the input argument is treted like in &BlastParse
#               - filename for BLAST report (input file) or
#               - process call or
#               - reference to file handle (typeglob reference)
#
# - options:
#   -debug      print debug report to STDERR
#   -PathOut    directory path and filename stem for png and map output. As
#               default 'BlastImg###' plus ending (.png and .map) is used.
#
# - return val: - array of:
#                 - complete filename for built png or
#                 - complete filename for image map
#                 - reference to data structure of parsed BLAST report
#               - undef if an error occurred
#
# DESCRIPTION
# - BLAST report ist parsed via &BlastParse.
#
# DEBUG, CHANGES, ADDITIONS
# - in the image's bottom line the short sequence ID (if available) should be
#   displayed instead of the long sequence ID. Therefore, the ID split
#   feature in &SeqLab::SeqFormat::SeqentryPopFasta should be modularized
#   and implemented here.
#
sub BlastImg {

  # drawing constants
  my %img;
  $img{dim}{space} = 10;
  $img{xDim}{sum} = 632;
  $img{yDim}{MaxIdChar} = 25;
  $img{xDim}{Font1} = 7;
  $img{yDim}{MaxEntry} = 200;
  $img{yDim}{Entry} = 10;
  $img{yDim}{Font2} = 6;
  $img{yDim}{Grid} = 5;
  $img{yDim}{ColLeg} = 12;
  # file I/O constants
  my $PhraseTemp = 'BlastImg###';

  # function parameters
  my ($ArgIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : 0;
  $debug and printf STDERR "%s. entered SUB\n", &MySub;

  # parse BLAST report, get BLAST report data structure
  my $pBlParse = &BlastParse($ArgIn,-debug=>$dbg2);
  unless ($pBlParse) {
    die sprintf "%s. ERROR: unable to parse BLAST report %s\n", &MySub, $ArgIn||"''";
  }
  unless (%{$$pBlParse{Match}}) {
    $debug and printf STDERR "%s. ERROR: no sequences parsed from BLAST report, nothing to do\n", &MySub;
    return (undef, undef, $pBlParse);
  }
  if ((grep { not exists $_->{id} } values %{$$pBlParse{Match}}) and 1) {
    printf STDERR "%s. WARNING: NULL entry in list of matches\n", &MySub;
  }

  # sample BLAST matches
  my $QueryId = substr ($$pBlParse{QuerySeq}{id}, 0, $img{xDim}{MaxIdChar});
  my @BlMatch = sort {
    $b->{score} <=> $a->{score} or -1;
    } values %{$$pBlParse{Match}};
  $debug and printf STDERR "%s. %d matching sequences in BLAST report data structure\n", &MySub, int @BlMatch;
  $#BlMatch = &Min ($#BlMatch, $img{yDim}{MaxEntry} - 1);

  #############################################################################
  # dimensioning and start of image

  # Y dimension
  $img{yDim}{sum} = int (@BlMatch)
    * $img{yDim}{Entry} + 6*$img{dim}{space} + 5*$img{yDim}{Entry} + $img{yDim}{ColLeg};

  # X dimension and positions
  # - get maximum identifier length
  # - define alignment start
  $img{xDim}{IdChar} = &Min ($img{yDim}{MaxIdChar},
    &Max (map{ length($_) } 'Color Legend:', $QueryId, map{ $_->{id} } @BlMatch));
  $img{xDim}{Id} = $img{xDim}{IdChar} * $img{xDim}{Font1};
  # $img{xPos}{AlnOff} is the x start point of the alignment block
  $img{xPos}{AlnOff} = 2*$img{dim}{space} + $img{xDim}{Id};
  $img{xDim}{Aln} = $img{xDim}{sum} - $img{xPos}{AlnOff} - $img{dim}{space};

  # start creating the image
  $img{obj} = new GD::Image ($img{xDim}{sum}, $img{yDim}{sum});
  $img{obj}->interlaced('true');

  # allocate colors
  foreach (qw(white black grey red violet blue green yellow)) {
    $img{pal}{$_} = $img{obj}->colorAllocate (@{$ColorLib{$_}});
  }

  # put a black frame around the picture
  $img{obj}->rectangle (0, 0, $img{xDim}{sum}-1, $img{yDim}{sum}-1, $img{pal}{black});

  #############################################################################
  # draw score color legend

  # current Y coordinates is $img{yPos}{'0'} and $img{yPos}{'1'} in pixels
  $img{yPos}{'1'} = $img{yDim}{sum} - $img{dim}{space} - $img{yDim}{Entry};    # text
  $img{yPos}{'0'} = $img{yPos}{'1'} - 0.5*$img{dim}{space} - $img{yDim}{ColLeg};  # color block

  # draw score color legend
  $img{grid}{StepCt} = $img{xDim}{Aln} / 5;
  $img{obj}->string (gdMediumBoldFont, $img{dim}{space}, $img{yPos}{'0'}, 'Color Legend:', $img{pal}{blue});
  $img{obj}->string (gdMediumBoldFont, $img{xPos}{AlnOff}+34+0*$img{grid}{StepCt}, $img{yPos}{'1'}, 'S<=50',      $img{pal}{black});
  $img{obj}->filledRectangle ($img{xPos}{AlnOff}+0*$img{grid}{StepCt}, $img{yPos}{'0'}, $img{xPos}{AlnOff}+1*$img{grid}{StepCt}, $img{yPos}{'0'}+$img{yDim}{ColLeg}, $img{pal}{blue});
  $img{obj}->string (gdMediumBoldFont, $img{xPos}{AlnOff}+18+1*$img{grid}{StepCt}, $img{yPos}{'1'}, '50<S<=100',  $img{pal}{black});
  $img{obj}->filledRectangle ($img{xPos}{AlnOff}+1*$img{grid}{StepCt}, $img{yPos}{'0'}, $img{xPos}{AlnOff}+2*$img{grid}{StepCt}, $img{yPos}{'0'}+$img{yDim}{ColLeg}, $img{pal}{grey});
  $img{obj}->string (gdMediumBoldFont, $img{xPos}{AlnOff}+14+2*$img{grid}{StepCt}, $img{yPos}{'1'}, '100<S<=150', $img{pal}{black});
  $img{obj}->filledRectangle ($img{xPos}{AlnOff}+2*$img{grid}{StepCt}, $img{yPos}{'0'}, $img{xPos}{AlnOff}+3*$img{grid}{StepCt}, $img{yPos}{'0'}+$img{yDim}{ColLeg}, $img{pal}{yellow});
  $img{obj}->string (gdMediumBoldFont, $img{xPos}{AlnOff}+16+3*$img{grid}{StepCt}, $img{yPos}{'1'}, '150<S<=200', $img{pal}{black});
  $img{obj}->filledRectangle ($img{xPos}{AlnOff}+3*$img{grid}{StepCt}, $img{yPos}{'0'}, $img{xPos}{AlnOff}+4*$img{grid}{StepCt}, $img{yPos}{'0'}+$img{yDim}{ColLeg}, $img{pal}{green});
  $img{obj}->string (gdMediumBoldFont, $img{xPos}{AlnOff}+35+4*$img{grid}{StepCt}, $img{yPos}{'1'}, 'S>200',      $img{pal}{black});
  $img{obj}->filledRectangle ($img{xPos}{AlnOff}+4*$img{grid}{StepCt}, $img{yPos}{'0'}, $img{xPos}{AlnOff}+5*$img{grid}{StepCt}, $img{yPos}{'0'}+$img{yDim}{ColLeg}, $img{pal}{red});

  #############################################################################
  # draw scale corresponding to query sequence
  my $reallength;

  # define brushes for grid drawing
  my %BrushPal;
  my $BrushRed = new GD::Image (1, $img{yDim}{Grid});
  foreach ('white', 'black', 'red') {
    $BrushPal{$_} = $BrushRed->colorAllocate (@{$ColorLib{$_}});
  }
  $BrushRed->transparent ($BrushPal{white});
  $BrushRed->line (0,0,0,5, $BrushPal{red});
  my $BrushBlack = new GD::Image (1, $img{yDim}{Grid});
  foreach ('white', 'black', 'red') {
    $BrushPal{$_} = $BrushBlack->colorAllocate (@{$ColorLib{$_}});
  }
  $BrushBlack->transparent ($BrushPal{white});
  $BrushBlack->line (0,0,0,5, $BrushPal{black});

  # query seq name in the height of the query sequence scale
  $img{yPos}{'1'} = $img{yPos}{'0'} - 1.5*$img{dim}{space} - $img{yDim}{Entry};  # bottom
  $img{yPos}{'0'} = $img{dim}{space};                                                  # top
  foreach ($img{yPos}{'0'}, $img{yPos}{'1'}) {
    $img{obj}->string (gdMediumBoldFont, $img{dim}{space}, $_, $QueryId, $img{pal}{red});
  }

  # filled rectangle for query sequence scale
  $img{yPos}{'0'} += 0;                   # top
  $img{yPos}{'1'} += $img{yDim}{Entry};   # bottom
  $img{yPos}{'1'} -= $img{yDim}{Grid};
  foreach ($img{yPos}{'0'}, $img{yPos}{'1'}) {
    $img{obj}->filledRectangle (                                                          
      $img{xPos}{AlnOff},                   $_+$img{yDim}{GrSp},
      $img{xPos}{AlnOff}+1+$img{xDim}{Aln}, $_+$img{yDim}{GrSp}+$img{yDim}{Grid},
      $img{pal}{red});
  }

  # grid intervals for representation of query sequence
  if ($$pBlParse{QuerySeq}{length} <= 500) {
    $img{grid}{StepSeq}  = 10;
    $img{grid}{unit}     = '';
    $img{grid}{StepCt} = 10;
  } elsif ($$pBlParse{QuerySeq}{length} <= 10000) {
    $img{grid}{StepSeq}  = 100;
    $img{grid}{unit}     = '';
    $img{grid}{StepCt} = 100;
  } else {
    $img{grid}{StepSeq}  = 1000;
    $img{grid}{unit}     = 'K';
    $img{grid}{StepCt} = 1;
  }

  # number of intervals, grid step size
  $img{grid}{StepNumF} = $$pBlParse{QuerySeq}{length} / $img{grid}{StepSeq};
  $img{grid}{StepPix} = $img{xDim}{Aln} / ($img{grid}{StepNumF} || 0.99);
  $reallength = $img{grid}{StepPix} * $img{grid}{StepNumF};

  # draw the grids
  $img{yPos}{'0'} += $img{yDim}{Grid} + 4;  # top
  $img{yPos}{'1'} -= $img{yDim}{Grid} - 1;  # bottom
  foreach ($img{yPos}{'0'}, $img{yPos}{'1'}) {
    $img{xPos}{'0'} = $img{xPos}{AlnOff};

    for (my $CtI=0; $CtI<=$img{grid}{StepNumF}; $CtI++) {
      $img{obj}->setBrush (($CtI % 5) ? $BrushBlack : $BrushRed);
      $img{obj}->line ($img{xPos}{'0'}, $_, $img{xPos}{'0'}, $_, gdBrushed);
      $img{xPos}{'0'} += $img{grid}{StepPix};
    }
  }

  # draw the grid strings
  $img{yPos}{'0'} += 2;                           # top
  $img{yPos}{'1'} -= 2*$img{yDim}{Grid} + 5;      # bottom
  foreach ($img{yPos}{'0'}, $img{yPos}{'1'}) {
    $img{xPos}{'0'} = $img{xPos}{AlnOff} - 2;

    # loop over grid positions
    for (my $CtI=0; $CtI<=$img{grid}{StepNumF}; $CtI++) {
      if (($CtI % 5) == 0) {
        $img{grid}{label} = ($CtI * $img{grid}{StepCt}) . $img{grid}{unit};
        $img{obj}->string (gdSmallFont, $img{xPos}{'0'}, $_+1, $img{grid}{label}, $img{pal}{black});
      }
      $img{xPos}{'0'} += $img{grid}{StepPix};
    }
  }

  #############################################################################
  # enter array of matching sequences from the BLAST report

  # start output of HTML image map paragraph
  my $PathMap = &PathUnique (-name=>"$PhraseTemp.map");
  open (OUTPUT_MAP, ">$PathMap");
  print OUTPUT_MAP "<MAP NAME=HSPMAP>\n";

  # current vertical coordinate is $img{yPos}{'0'} pixels
  $img{yPos}{'0'} += $img{yDim}{Font2} + $img{dim}{space};

  # loop over sequence IDs
  if ((grep { not exists $_->{id} } values %{$$pBlParse{Match}}) and 1) {
    printf STDERR "%s. WARNING: NULL entry in list of matches\n", &MySub;
  }
  foreach my $ItMatch ( @BlMatch ) {

    # add sequence ID to current image line, add link to the image map
    $img{obj}->string (gdMediumBoldFont, $img{dim}{space}, $img{yPos}{'0'}, $$ItMatch{id}, $img{pal}{black});
    printf OUTPUT_MAP "  <AREA SHAPE=RECT COORDS=%d,%d,%d,%d HREF=\"#$$ItMatch{id}\">\n",
      $img{dim}{space}, $img{yPos}{'0'}, $img{dim}{space}+$img{xDim}{Id}, $img{yPos}{'0'}+$img{yDim}{Entry};

    # loop over each HSP belonging to the seq ID (already sorted by HSP score)
    # start/end/length values of segments are derived by &BlastParse
    foreach my $ItHsp (sort { $a->{score}<=>$b->{score} } @{$$ItMatch{HSP}}) {

      # calculate the range of the match image frame (pixels) for each HSP.
      # The maximum value will be $reallength, as evaluated for the
      # query sequence, minimum is 1.
      $img{xPos}{RelHspOff} = int (($$ItHsp{QueryBeg}-1) * $reallength / $$pBlParse{QuerySeq}{length});
      $img{xPos}{RelHspLen} = int ( $$ItHsp{QueryLen}    * $reallength / $$pBlParse{QuerySeq}{length});
      if ($img{xPos}{RelHspLen} < 1) {
        $img{xPos}{RelHspLen} = 1;
      }
      $debug and printf STDERR "%s. match %s, HSP start %d, HSP real length %d\n", &MySub,
        $$ItMatch{id}, $$ItHsp{QueryBeg}, $img{xPos}{RelHspLen};

      # draw a filled rectangle corresponding to the HSP
      # color corresponds to the score
      if ($$ItHsp{score} <= 50) {
        $$ItHsp{color} = $img{pal}{blue};
      } elsif (($$ItHsp{score} >  50) and ($$ItHsp{score} <= 100)) {
        $$ItHsp{color} = $img{pal}{grey};
      } elsif (($$ItHsp{score} > 100) and ($$ItHsp{score} <= 150)) {
        $$ItHsp{color} = $img{pal}{yellow};
      } elsif (($$ItHsp{score} > 150) and ($$ItHsp{score} <= 200)) {
        $$ItHsp{color} = $img{pal}{green};
      } else {
        $$ItHsp{color} = $img{pal}{red};
      }
      $img{obj}->filledRectangle (
        $img{xPos}{AlnOff}+$img{xPos}{RelHspOff}, $img{yPos}{'0'}+4,
        $img{xPos}{AlnOff}+1+$img{xPos}{RelHspOff}+$img{xPos}{RelHspLen}, $img{yPos}{'0'}+9,
        $$ItHsp{color});

    } # end foreach $ItHsp
    $img{yPos}{'0'} += $img{yDim}{Entry};

  } # end foreach $ItMatch

  # end of image map
  print OUTPUT_MAP "</MAP>\n";
  close (OUTPUT_MAP);

  #############################################################################
  # post-work

  # write png-formatted image to file
  my $PathImg = &PathUnique (-name=>"$PhraseTemp.png");
  &WriteFile ($PathImg, $img{obj}->png);

  # return from SUB
  $debug and printf STDERR "%s. returning successfully\n", &MySub;
  return ($PathImg, $PathMap, $pBlParse);
}


################################################################################
# BLAST report conversion
################################################################################


# format BLAST report to HTML
#
# INTERFACE
# - argument 1: file path for input of BLAST report => $FileIn
#
# - options:
#   -debug      print debug report to STDERR
#   -FullHtml   produces full HTML output. As default, the output doesn't
#               have a complete HTML document frame. opt is simply set by
#               true value.
#   -PathOut    directory path and filename stem for HTML output. As default
#               'Blast###.html' is used.
#
# - return val: - full path name for HTML output
#               - undef if an error occurred
#
# DESCRIPTION
# - the BLAST report is read as a file from the file system (file path is argument 1)
#   and the processed report is saved as a file (file path is returned).
#
# DEBUG, CHANGES, ADDITIONS
# - do matching against (\S+) before applying $reSeqid{'*'}
#
# DEVELOPER'S NOTES
# - Independent stand-alone code for inserting retrieval links into HTML-
#   formatted BLAST report exists, script BlastdbLink.pl, package blastcgi.
#
sub Blast2Html {
  my $ArgOut = 'Blast###.html';
  my ($FileIn,%opt) = @_;
  my $debug = $opt{-debug};
  if ($opt{-PathOut}){ $ArgOut=$opt{-PathOut} };

  # open input file, open output file
  my $hIn=FileHandle->new($FileIn);
  unless ($hIn) {
    die sprintf "%s. ERROR: unable to open input file $FileIn\n", &MySub;
  }
  my $PathOut = &PathUnique (-name => $ArgOut);
  my $hOut=FileHandle->new($PathOut,'w');
  unless ($hOut) {
    die "%s. ERROR: unable to open output file $PathOut\n", &MySub;
  }

  ##############################################################################
  # pre-work, HTML header

  # HTML header
  if ($opt{-FullHtml}) {
    print $hOut <<"HTML_HEADER_END";
<HTML><HEAD>
  <TITLE>BLAST report</TITLE>
</HEAD>
<BODY>
HTML_HEADER_END
  }
  print $hOut "<PRE>\n";

  # crawl on until reaching database specification
  my ($line,$buffer,$bSpecial,$db);
  while (<$hIn>) {
    $buffer .= $_;
    if (m/^Database:/i) {
      if (0) { }
      elsif (m/dicty\b.*\.prot/i) {
        $bSpecial = 'DictyProtein';
      }
      elsif (m/dicty\b.*\.protg/i) {
        $bSpecial = 'DictyProteinG';
      }
      elsif (m/\breti_\S+/) {
        $bSpecial = 'RetiSome';
        $db=$&;
      }
      print $hOut "<!-- \$bSpecial=$bSpecial -->\n";
      last;
    }
  }
  print $hOut $buffer;

  ##############################################################################
  # make links to sequence source and alignment paragraphs

  # Dictyostelium gene model mode (protein, DD ID)
  # - currently, linking to sequences is not possible
  if ($bSpecial eq 'DictyProtein') {
    # leave untouched until match index
    while (<$hIn>) {
      $buffer .= $_;
      if (m/High-scoring Segment Pairs:/i) {
        $buffer .= scalar <$hIn>;
        last;
      }
    }
    print $hOut $buffer;
    # link IDs and scores in match index
    while (<$hIn>) {
      $line = $_;
      if (m/^(dd_\d+)\b(.*\s+)(\d+)(\s+\d+\.\d+(e\-\d+)?(\s+\d+)?)$/) {
        $line = "<A HREF='http://genome.imb-jena.de/dicdi/chr2oracle_d06.pl?genmodel=$1' TARGET='_blank'>$1</A>" .
          "$2<A HREF=#$1>$3</A>$4$'";
      }
      print $hOut $line;
      if (m/^$/i) { last }
    }
    # link IDs in match segment
    while (<$hIn>) {
      $line = $_;
      if (m/^>(dd_\d+)\b/) {
        $line = "<A NAME=$1>>" .
          "<A HREF='http://genome.imb-jena.de/dicdi/chr2oracle_d06.pl?genmodel=$1' TARGET='_blank'>$1</A>" .
          $';
      }
      print $hOut $line;
    }
  }

  # Dictyostelium gene model mode (protein, gX ID)
  # - currently, linking to sequences is not possible
  elsif ($bSpecial eq 'DictyProteinG') {
    # leave untouched until match index
    while (<$hIn>) {
      $buffer .= $_;
      if (m/High-scoring Segment Pairs:/i) {
        $buffer .= scalar <$hIn>;
        last;
      }
    }
    print $hOut $buffer;
    # link IDs and scores in match index
    while (<$hIn>) {
      $line = $_;
      if (m/^(\S+_g\d+)\b(.*\s+)(\d+)(\s+\d+\.\d+(e\-\d+)?(\s+\d+)?)$/) {
        $line = "<A HREF='http://genome.imb-jena.de/dicdi/chr2oracle_name.pl?genmodel=$1' TARGET='_blank'>$1</A>" .
          "$2<A HREF=#$1>$3</A>$4$'";
      }
      print $hOut $line;
      if (m/^$/i) { last }
    }
    # link IDs in match segment
    while (<$hIn>) {
      $line = $_;
      if (m/^>(\S+_g\d+)\b/) {
        $line = "<A NAME=$1>>" .
          "<A HREF='http://genome.imb-jena.de/dicdi/chr2oracle_name.pl?genmodel=$1' TARGET='_blank'>$1</A>" .
          $';
      }
      print $hOut $line;
    }
  }

  # Reticulomyxa contigs and gene model proteins[?]
  elsif ($bSpecial eq 'RetiSome') {
    # leave untouched until match index
    while (<$hIn>) {
      $buffer .= $_;
      if (m/High-scoring Segment Pairs:/i) {
        $buffer .= scalar <$hIn>;
        last;
      }
    }
    print $hOut $buffer;
    # link IDs and scores in match index
    while (<$hIn>) {
      $line = $_;
      if (m/^((?:contig|solexa)\d+\S*\.exp(?:_\d+)?)(.*\s+)(\d+)(\s+\d+\.(?:\d+(?:e\-\d+)?)?(?:\s+\d+)?)$/) {
        $line = "<A HREF='http://genome.imb-jena.de/reti_blast/BlastdbSeq.cgi?db=$db&id=$1' TARGET='_blank'>$1</A>" .
          "$2<A HREF=#$1>$3</A>$4$'";
      }
      print $hOut $line;
      if (m/^$/i) { last }
    }
    # link IDs in match segment
    while (<$hIn>) {
      $line = $_;
      if (m/^>((?:contig|solexa)\d+\S*\.exp(?:_\d+)?)/) {
        $line = "<A NAME=$1>>" .
          "<A HREF='http://genome.imb-jena.de/reti_blast/BlastdbSeq.cgi?db=$db&id=$1' TARGET='_blank'>$1</A>" .
          $';
      }
      print $hOut $line;
    }
  }

  # default mode
  # - link between index and hit header
  # - link to sequences
  else {
    while (defined ($line=<$hIn>)) {

      # index line or alignment header line with access identifier
      # - Note brackets in regexp string $SeqLab::SeqFormat::reSeqid{All}!
      if ($line =~ m/^(>?)$reSeqid{All}/o) {
        my ($lpre,$acc,$lpost) = ($1,$2,$');

        # match index
        # => highlight high score as bookmark link to alignment
        if (!$lpre and $lpost=~m/(^.*\s+)(\d+)(\s+\d+\.\d*(e\-\d+)?\s+\d+)$/) {
          $lpost = "$1<A HREF=#$acc>$2</A>$3$'";
        }

        # alignment header
        # => link to sequence source
        my $accsv = $acc;
        $acc = &SeqidLink ($accsv);
        if ($debug and $accsv eq $acc) {
          printf STDERR "%s. linking of match identifier string failed: %s\n", &MySub, $acc;
        }
        $acc =~ s/<A[^>]+/$& TARGET='_blank'/;

        # reconstruct line
        $line = $lpre . $acc . $lpost;
      }

      # alignment header line with identifier
      elsif ($line =~ m/^>(\S+)/) {
        my ($lpre,$acc,$lpost) = ('>',$1,$');

        # alignment header => link from match index
        $lpre = "<A NAME=$acc>\n$lpre";
        $line = $lpre . $acc . $lpost;
      }

      # match index line =>
      # highlight high score as bookmark link to alignment
      elsif ($line =~ m/^(\S+)(.+\s+)(\d+)(\s+\d+\.\d*(e\-\d+)?\s+\d+)$/) {
        $line = "$1$2<A HREF=#$1>$3</A>$4$'";
      }

      # parameter header line
      elsif ($line =~ m/^Parameters:$/o) {
        $line = "</PRE>\n<H2>BLAST Parameters</H2>\n<PRE>";
      }

      # output changed line
      print $hOut $line;
    }
  }

  ##############################################################################
  # HTML header, post-work

  # HTML trailer
  print $hOut "</PRE>\n";
  if ($opt{-FullHtml}) {
    print $hOut "</BODY></HTML>\n";
  }

  # exit
  return $PathOut;
}


1;
# $Id: Blast.pm,v 1.35 2018/06/05 18:02:56 szafrans Exp $
