################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Sequence Alignments
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 1998-2001,
#    szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %_LibParam
#
# - basics
#   $_LibParam{TmpManag}
#   $_LibParam{Key*}  standard keys for alignment data structure
#   $reAlnConsens
#
# - data structure and rough sequence formatting
#   &AlnClone
#   &AlnSort
#   &AlnMargins
#   &AlnCompress
#
# - I/O Clustal W, fastA
#   $_sClustalHead
#   $reClustalHead
#   $reClustalEnd
#   $_reClustalLine
#   &_ClustalToHash
#   &AlnClustalRead
#   &AlnClustalSprint
#   &_AlnClustalCompactSort
#   &AlnClustal
#   &AlnToFasta
#
# - functions focusing on sequence symbols
#   &AlnRevcompl
#   &_AlnMaskgapEnd
#   &_AlnMaskgapInside
#   &AlnResolveUnk
#
# - analysis resulting in plot of local measure
#   &AlnTrueSeqpos
#   &AlnTabCover
#   &AlnTabSmbFreq
#   &AlnTabSmbFreqCPos
#   &AlnTabConsDev
#   &AlnTabRelConserv
#   &AlnTabMatScore
#   &AlnTabFrameshift
#
# - analysis resulting in global measure
#   &AlnIdentity
#   &AlnDiversity
#   &AlnConsens
#
# - analysis resulting in complex data structure
#   &AlnTabDist
#
#
#  STD OPTIONS
#
#   -debug       print debug protocol to STDERR
#   -KeyId       use specified key to access identifier (default:
#                $_LibParam{KeyId}).
#   -KeyOff      use specified key to access identifier (default:
#                $_LibParam{KeyOff}).
#   -KeySeq      use specified key to access identifier (default:
#                $_LibParam{KeySeq}).
#   -MaskGapEnd  mask gaps at sequence ends, specify boolean value, default: 0.
#   -MaskGapIns  mask inner gaps if longer than specified length
#   -SmbUnk      explicitly specify the symbol for unknown emissions, default:
#                the appropriate symbol according to the sequence type of the
#                first alignment entry (cmp. definitions in
#                %SeqLab::SeqBench::SeqSmbUnk)
#   -timer       print time performance protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - In function &AlnClustal, filenames for temporary files are requested, and
#   prior to use the filename suffix is modified. This way, the manager of
#   temporary files cannot glob() successfully for these temporary files.
#
# - replace the -uplow switch by -SlcCase switch
#   &AlnIdentity
#   &AlnConsens
#   &AlnTabSmbFreq
#   &AlnTabSmbFreqCPos
#
# - A switch -CaseSensit would make sense from a theoretical point of view.
#   However, it wasn't needed so far.
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::Align;

# includes
use strict; #use warnings;  # OK 20040817
use FileHandle;
use MainLib::StrRegexp qw(&MatchCt);
use MainLib::Data qw(&DataPrint &ListMaxfirst);
use MainLib::Path;
use MainLib::File qw(&ReadFile);
use MainLib::FileTmp qw(&PathUnique);
use MainLib::Misc qw(&MySub);
use Math::Calc;
use SeqLab::SeqBench;
use SeqLab::SeqFormat ();
use SeqLab::SeqComp qw(&CompMatrixMean);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  $reAlnConsens
  &AlnClone &AlnSort &AlnMargins &AlnCompress
  $reClustalEnd $reClustalHead
    &AlnClustalRead &AlnClustalSprint &AlnClustal &AlnToFasta
  &AlnRevcompl &AlnResolveUnk
  &AlnIdentity &AlnDiversity &AlnConsens
  &AlnTrueSeqpos &AlnTabCover &AlnTabSmbFreq &AlnTabSmbFreqCPos
    &AlnTabConsDev &AlnTabRelConserv &AlnTabMatScore &AlnTabFrameshift
  &AlnTabDist
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# basics
################################################################################


# handle temporary files globally
# - encapsulation in a sub{} allows MainLib::FileTmp to identify the calling
#   package
my $pcFT = sub{ $_LibParam{TmpManag} ||= MainLib::FileTmp->new(); };
&$pcFT;


# alignment data structure
#
# - structure is an array of hashes, each representing a single sequence
#   entry:
#     @Alignment->%Seq
#   So, the main structure is exactly the same as for the sequence array
#   structure, defined in SeqLab::SeqFormat.
#
# - sequence hash contains the basic fields (keys) from the sequence array
#   data structure:
#     id        identifier (only word characters and '.')
#     sequence  sequence string
#               what makes the alignment data structure a special thing beside
#               the sequence array data structure is the behaviour of the
#               sequence string.
#   The hash may contain several more fields (keys):
#     offset    sequence stretch offset referring to the start of the original
#               sequence.
#

# standard keys for alignment data structure
$_LibParam{KeyId}  = $SeqLab::SeqFormat::LibGlob{KeyId};
$_LibParam{KeyOff} = 'offset';
$_LibParam{KeySeq} = $SeqLab::SeqFormat::LibGlob{KeySeq};

# what the 'id' field of the consensus entry should look like
our $reAlnConsens = 'consens';


################################################################################
# data structure and rough sequence formatting
################################################################################


# return clone of alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#
# - options:
#   -debug       [STD]
#   -DelConsens  delete consensus entry if found in the mother project. A
#                consensus entry is defined by an identifier scheme,
#                $reAlnConsens
#   -KeyId       [STD]
#   -KeySeq      [STD]
#   -KeyStd      standardise returned data structure. All non-standard keys
#                will be removed.
#   -Low2Up      convert and lower-case letters to upper case
#   -LowHide     change lower-case letters to end gaps
#
# - return val:  - reference to child alignment data structure
#                - undef an error occurred
#
# DESCRIPTION
# - the function simply completely clones all hashes in the referenced array.
#   We don't apply recursive copying like in &MainLib::Misc::DataClone
#   which has these advances:
#   - extract only alignment-coding hash level from a branching
#     data structure
#   - circumvent back-references in a more complex data structure
#
sub AlnClone {
  my ($pAlnMother, %opt) = @_;
  my (%key);
  my ($pAlnChild, $CtI);

  # function parameters
  $key{mom}{id}  = $opt{-KeyId} || $_LibParam{KeyId};
  $key{mom}{off} = $_LibParam{KeyOff};
  $key{mom}{seq} = $opt{-KeySeq} || $_LibParam{KeySeq};
  $key{child}{id}  = $opt{-KeyStd} ? $_LibParam{KeyId} : $key{mom}{id};
  $key{child}{off} = $_LibParam{KeyOff};
  $key{child}{seq} = $opt{-KeyStd} ? $_LibParam{KeySeq} : $key{mom}{seq};

  # loop over entries (each being a hash)
  for ($CtI=0; $CtI<@$pAlnMother; $CtI++) {

    # exclude consensus from alignment if present
    if ($opt{-DelConsens} and
        $$pAlnMother[$CtI]{$key{mom}{id}} =~ m/$reAlnConsens/io) {
      next;
    }

    # copy entry hashes into new structure
    if ($opt{-KeyStd}) {
      push @$pAlnChild, {
        $key{child}{id}  => $$pAlnMother[$CtI]{$key{mom}{id}},
        $key{child}{off} => $$pAlnMother[$CtI]{$key{mom}{off}},
        $key{child}{seq} => $$pAlnMother[$CtI]{$key{mom}{seq}},
        };
    } else {
      push @$pAlnChild, { %{$$pAlnMother[$CtI]} };
    }

    # change lower-case characters to upper (option -Low2Up)
    if ($opt{-Low2Up}) {
      $$pAlnChild[$#$pAlnChild]{$key{child}{seq}} =~ tr/a-z/A-Z/;
    }
    # change lower-case characters to '-' (option -LowHide)
    if ($opt{-LowHide}) {
      $$pAlnChild[$#$pAlnChild]{$key{child}{seq}} =~ s/[a-z]/-/g;
    }
  }

  # exit SUB
  return $pAlnChild;
}


# sort entries of an alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               data is explicitly changed
#
# - options:
#   -KeyId      [STD]
#   -SortType   criterion for sorting:
#               id      sort by IDs, ascending alphanumerically. This
#                       is default criterion.
#               offset  sort by sequence offset in alignment
#
# DESCRIPTION
# - in AlnK alignment projects the consensus should be removed before
#   this action.
#
sub AlnSort {
  my ($pAln, %opt) = @_;
  my $KeyId = $opt{-KeyId} || $_LibParam{KeyId};

  # sort by sequence offset in alignment
  if ($opt{-SortType} eq 'offset') {
    foreach (@$pAln) {
      $_->{sequence} =~ m/[A-Z]/;
      $_->{TempOffset} = length $`;
    }
    @$pAln = sort { $a->{TempOffset} <=> $b->{TempOffset} or $a cmp $b; } @$pAln;
  }

  # standard sorting: IDs ascending alphanumerically
  else {
    @$pAln = sort { $a->{$KeyId} cmp $b->{$KeyId} or $a cmp $b; } @$pAln;
  }
}


# make sharp and space-reduced margins in an alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               data is explicitly changed
# - return val: reference to alignment, should be unchanged arg1
#
sub AlnMargins {
  my ($pAln, %opt) = @_;
  my ($CtMargin);

  # right margin fill in
  foreach (@$pAln) {
    $_->{sequence} =~ s/-+$//;                           # remove right spacing
    $CtMargin = &Max ($CtMargin, length $_->{sequence}); # get maximum length
  }
  foreach (@$pAln) {                                    # fill up to right margin
    $_->{sequence} .= '-' x ($CtMargin - (length $_->{sequence}));
  }

  # left truncate
  $CtMargin = length $$pAln[0]{sequence};
  foreach (@$pAln) {                                    # get length of consensus left space
    $CtMargin = &Min ($CtMargin, ($_->{sequence} =~ m/^-+/) ? length $& : 0);
  }
  if ($CtMargin) {
    foreach (@$pAln) {                                  # remove left space
      substr($_->{sequence}, 0, $CtMargin) = '';
    }
  }

  # exit SUB
  return $pAln;
}


# delete empty sequence entries / redundant gaps in an alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               data is explicitly changed
#
# - options:
#   -debug      [STD]
#
# - return val: reference to compressed alignment
#               should be unchanged arg1
#
# DESCRIPTION
# - output has nice margins guaranteed.
#
sub AlnCompress {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2);
  my ($CtI);

  # parameters, nice margins
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  &AlnMargins ($pAln, -debug=>$dbg2);

  # delete empty entries
  for ($CtI=0; $CtI<@$pAln; $CtI++) {
    if ($$pAln[$CtI]{sequence} =~ m/^-*$/) {
      $debug and print  STDERR"AlnCompress. deleting empty sequence entry no. $CtI\n";
      splice @$pAln, $CtI, 1;
      $CtI--;
    }
  }

  # remove redundant gaps
  AlnCompressRange:
  for ($CtI=0; $CtI<length($$pAln[0]{sequence}); $CtI++) {
    foreach (@$pAln) {
      if (substr($_->{sequence}, $CtI, 1) ne '-') { next AlnCompressRange }
    }
    $debug and printf STDERR "%s. deleting redundant gap at pos. $CtI\n", &MySub;
    foreach (@$pAln) {
      substr ($_->{sequence}, $CtI, 1) = '';
    }
    $CtI --;
  }

  # exit SUB
  return $pAln;
}


################################################################################
# I/O Clustal W
################################################################################


# strings / RegExps Clustal W format
my $_sClustalHead = 'CLUSTAL W (1.8) multiple sequence alignment';
our $reClustalHead = 'CLUSTAL W \(\d.\d{1,3}\) multiple sequence alignment';
my $_reClustalLine = '^([\w\|\.-]+) {3,}([a-zA-Z-]+)$';
our $reClustalEnd  = '\n[ \*\.:]{10,}(\n|$)';


# extract aligned sequences from Clustal W file
#
# INTERFACE
# - argument 1:  alignment in Clustal W format
#
# - options:
#   -debug      print debug protocol to STDOUT
#
# - return val: reference to hash with sequence ID as key, sequence as
#               entry. Upper/lower case is as in the original file. So, the
#               data is not returned in the standard alingment data
#               structure. This is done by AlnClustalRead, which itself
#               calls _ClustalToHash().
#
# DESCRIPTION
# - if there're multiple Clustal W alignments in the plain text, they're
#   all intermingled.
# - returned sequences are left in original upper/lower case syntax
# - cmp. &AlnClustalRead
#
sub _ClustalToHash {
  my $StartLine = 5;
  my ($pClustalPlain, %opt) = @_;
  my (%Entry);

  # extract identifier and corresponding sequence
  if ($opt{-debug}) {
    $$pClustalPlain =~ m/^(.*\n){$StartLine}/m;
    printf "%s. Alignment starting lines:\n", &MySub;
    print $&;
  }

  # extract identifier and corresponding sequence
  while ($$pClustalPlain =~ m/$_reClustalLine/gmo) {
    $Entry{$1} .= $2;
    if ($opt{-debug}) {
      printf "%s. Got alignment entry %s\n", &MySub, $1||"''";
    }
  }

  # exit SUB
  return \%Entry;
}


# extract alignment data structure from Clustal W plain text
#
# INTERFACE
# - argument 1: reference to plain text alignment in Clustal W format
#
# - options:
#   -ConsDel    remove consensus entry/entries if present
#   -debug      [STD]
#   -KeyId      [STD]
#   -timer      [STD]
#
# - return val: - reference to alignment data structure,
#               - undef if an error occurred
#
# DESCRIPTION
# - if there're multiple Clustal W alignments in the plain text, they're
#   all intermingled.
# - returned sequences are coded in original upper/lower case syntax
# - this SUB calls &_ClustalToHash to get the full sequences from the
#   Clustal W plain text.
#
sub AlnClustalRead {
  my ($pClustalPlain,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId} || $_LibParam{KeyId};
  $debug and printf STDERR "%s. KeyId=%s (%s or %s=%s)\n", &MySub,
    $KeyId||'', $opt{-KeyId}||'', $_LibParam{KeyId}||'', $SeqLab::SeqFormat::LibGlob{KeyId}||'';
  my $bTimer = $opt{-timer}; my $time;
  my ($ClustalBlock1,$pEntry1st,@EntryFinal);

  # extract all entries: identifier and corresponding sequence
  $bTimer and $time = (times)[0];
  $pEntry1st = &_ClustalToHash ($pClustalPlain, -debug=>$dbg2);
  $bTimer and printf STDERR "%s. CPU Time for primary Clustal W parsing: %.3f s\n", &MySub, (times)[0] - $time;
  unless (%$pEntry1st) { return undef }
  $bTimer and $time = (times)[0];

  # grab first alignment block
  if ($$pClustalPlain =~ m/$reClustalEnd/o) {
    $ClustalBlock1 = $`;
  } else { return undef }

  # sort entries by order of occurrence in first alignment block
  # _reClustalLine has brackets in it!
  while ($ClustalBlock1 =~ m/$_reClustalLine/gmo) {
    my $id = $1;
    push @EntryFinal, {
      $KeyId   => $id,
      sequence => $$pEntry1st{$id},
      };

    # remove alignment entry from primary hash
    # remaining entries are entered later (unordered)
    delete $$pEntry1st{$id};

    # skip entry if it's a consensus entry and the -ConsDel option is set
    if ($opt{-ConsDel} and $id=~m/$reAlnConsens/) {
      pop @EntryFinal;
    } elsif ($debug) {
      printf STDERR "%s. next entry in ordered alignment portion: %s\n", &MySub,
        $EntryFinal[$#EntryFinal]{$KeyId};
    }
  }

  # append remaining alignment entries unordered
  while (my @temp = each(%$pEntry1st)) {
    push @EntryFinal, {
      $KeyId   => $temp[0],
      sequence => $temp[1],
      };
  }

  # debug protocol
  if ($debug) {
    printf STDERR "%s. got following sequences (total: %d):\n", &MySub, int(@EntryFinal);
    foreach (@EntryFinal) {
      printf STDERR "%s=>%s, sequence=>$_->{sequence}\n", $KeyId||'', $_->{$KeyId};
    }
  }
  $bTimer and printf STDERR "%s. CPU Time for secondary parsing and data construction: %.3f s\n", &MySub, (times)[0] - $time;

  # exit SUB
  return \@EntryFinal;
}


# format alignment to Clustal W document
#
# INTERFACE
# - argument 1: reference to alignment data structure
#
# - options:
#   -1block     print the complete alignment into one alignment block. Specify
#               boolean value, default: 0.
#   -ClipEnd    Truncate gaps at the end of sequence strings in the alignment.
#               This and more is also done with option -compact.
#   -compact    Truncate gaps at the end of sequence strings in the alignment.
#               Sort information-containing sequence strings of each alignment
#               block to the beginning.
#   -ConsensNum Add consensus base positions to the alignment
#   -debug      [STD]
#   -KeyId      [STD]
#
# - return val: Clustal W formatted alignment
#
# DESCRIPTION
# - via the fixed order of entries in the array the order of their output
#   is defined. opt -compact renders this behaviour.
#
sub AlnClustalSprint {
  my ($pAln, %opt) = @_;
  my %dim = ( StdSpace=>5, DfltBlock=>60, );  # default block dimensioning
  my ($debug, $dbg2, $KeyId);
  my ($pConsPlot, $pConsPlot10);
  my ($ClustalPlain, $StrConsNum1, $StrConsNum2,
      $CtConsStrpos, $ConsStrposNext, $ConsBasepos, $bNum);
  my (@AlnPrint, $CtLeft, $CtI);

  # parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $KeyId = $opt{-KeyId} || $_LibParam{KeyId};

  # clip end gaps
  if ($opt{-ClipEnd} or $opt{-compact}) {
    $pAln = &AlnClone ($pAln, -KeyId=>$opt{-KeyId});
    foreach (@$pAln) {
      $_->{sequence} =~ s/-*$//;
    }
  }

  # get maximum length of ID and sequence string
  $dim{maxid}  = &Max (map { length $_->{$KeyId} } @$pAln);
  $dim{maxseq} = &Max (map { length $_->{sequence} } @$pAln);

  # ID dimensioning and sequence block length
  $dim{id} = $dim{maxid} + $dim{StdSpace};
  $dim{block} = $opt{'-1block'} ? $dim{maxseq} : $dim{DfltBlock};

  # option '-ConsensNum': get consensus position plot (every 10th position)
  if ($opt{-ConsensNum}) {
    if ($$pAln[0]{$KeyId} =~ m/$reAlnConsens/io) {
      $pConsPlot = &AlnTrueSeqpos ($$pAln[0]{sequence});
      for ($CtI=10; $CtI<@$pConsPlot; $CtI+=10) {
        push @$pConsPlot10, $$pConsPlot[$CtI];
      }
      $debug and printf STDERR "%s. every 10th consensus base position: %s\n", &MySub,
        join ', ', @$pConsPlot10;
    } else {
      $opt{-ConsensNum} = 0;
    }
  }

  # output Clustal W header
  $ClustalPlain .= sprintf "%s\n\n\n", $_sClustalHead;
  if ($debug) {
    printf STDERR "%s. first regular ID: %s\n", &MySub, $$pAln[1]{$KeyId}||"''";
    printf STDERR "%s. formatting identifier with \%-$dim{id}s\n", &MySub;
  }

  # loop over alignment blocks
  my $CtPos = 0;
  while (($CtLeft = $dim{maxseq} - $CtPos) > 0) {
    $debug and printf STDERR "%s. applying consenus numbering\n", &MySub;

    # option '-ConsensNum': string for consensus base positions
    if ($opt{-ConsensNum}) {
      $debug and printf STDERR "%s. applying consenus numbering\n", &MySub;
      $CtConsStrpos = $CtPos;
      $StrConsNum1 = ' ' x $dim{id};
      $StrConsNum2 = $StrConsNum1;
      $bNum = 0;
      while (@$pConsPlot10 and $$pConsPlot10[0]<=$CtPos+$dim{block}) {
        $bNum = 1;
        $ConsStrposNext = shift @$pConsPlot10;
        $ConsBasepos += 10;
        $StrConsNum1 .= ' ' x ($ConsStrposNext - $CtConsStrpos - 1)
          . $ConsBasepos;
        $StrConsNum2 .= ' ' x ($ConsStrposNext - $CtConsStrpos - 1)
          . '|' . ' ' x ((length $ConsBasepos) - 1);
        $CtConsStrpos = $ConsStrposNext + (length $ConsBasepos) - 1;
      }
      if ($bNum) {
        $ClustalPlain .= "$StrConsNum1\n$StrConsNum2\n";
      }
    }

    # option '-compact': sort / output
    if ($opt{-compact}) {
      undef @AlnPrint;
      foreach (@$pAln) {
        push @AlnPrint, {
          $KeyId   => sprintf ("%-$dim{id}s", $_->{$KeyId}),
          sequence => substr ($_->{sequence}, $CtPos, $dim{block}),
          };
      }
      if ($CtPos > 0) {
        &_AlnClustalCompactSort (\@AlnPrint);
      }
      foreach (@AlnPrint) {
        $ClustalPlain .= sprintf "%s%s\n", $_->{$KeyId}, $_->{sequence};
      }
    }

    # output alignment block just as the alignment is
    else {
      foreach (@$pAln) {
        $ClustalPlain .= sprintf "%-$dim{id}s%s\n",
          $_->{$KeyId}, substr($_->{sequence},$CtPos,$dim{block});
      }
    }

    # terminate alignment block
    $ClustalPlain .= sprintf "%s\n\n",
      ' ' x ($dim{id} + &Min ($CtLeft, $dim{block}));

    # redefine current alignment cursor position
    $CtPos += $dim{block};
  }

  # exit SUB
  return $ClustalPlain;
}


# sort entries of an alignment block for better view
#
# INTERFACE
# - argument 1: reference to alignment data structure (for definition see above)
#               data is explicitly changed
#
sub _AlnClustalCompactSort {
  my ($pAln, %opt) = @_;
  my (@AlnSeq, @AlnEmpt);

  # delete empty entries
  for (my $CtI=0; $CtI<int(@$pAln); $CtI++) {
    if (! $$pAln[$CtI]{sequence}) {
      splice @$pAln, $CtI, 1;
      $CtI --;
    }
  }

  # sort out entries with sequence representations
  foreach (@$pAln) {
    if ($_->{sequence} =~ m/[A-Z]/) {
      push @AlnSeq, $_;
    } else {
      push @AlnEmpt, $_;
    }
  }
  @$pAln = (@AlnSeq, @AlnEmpt);
}


# perform Clustal W on array of sequence data structures and build alignment
# data structure
#
# INTERFACE
# - argument 1: reference to array of sequence data structures
#
# - options:
#   -debug      [STD]
#   -gapopen    CLUSTAL W parameter
#   -gapext     CLUSTAL W parameter
#   -KeyId      [STD]
#
# - return val: - reference to alignment data structure
#               - undef if an error occurred
#
sub AlnClustal {
  require SeqLab::SeqFormat; SeqLab::SeqFormat->import qw(&SeqentryToFasta);
  my ($paSeq,%opt) = @_;
  my $debug = $opt{-debug} || 0;
  my $dbg2 = $debug ? $debug-1 : undef;

  my ($FileSeq, $hOutSeq, $SeqType, $FileAln, $FileDnd,
      %ParamClustal, $ParamClustalPlain, $CallClustal, $LogClustal);
  my ($PlainAln, $pAln);

  # save sequences to temporary file
  $_LibParam{TmpManag}->AddSwitch(-debug=>$dbg2);
  ($FileSeq,$hOutSeq) = $_LibParam{TmpManag}->Create();
  foreach my $pSeq (@$paSeq) {
    print  $hOutSeq &SeqentryToFasta($pSeq,-KeyId=>$opt{-KeyId},-debug=>$dbg2);
  }
  $hOutSeq->close();
  $SeqType = &SeqType ($$paSeq[0]{sequence});
  $SeqType =~ s/^RNA/DNA/;

  # perform alignment using Clustal W
  $FileAln = $_LibParam{TmpManag}->Create();
  # non-safe modification of filename of temporary file
  $FileAln = &PathChgSuffix ($FileAln, 'aln', -last=>1);
  $FileDnd = &PathChgSuffix ($FileAln, 'dnd', -last=>1);
  %ParamClustal = (
    -gapopen => $opt{-gapopen} || '0.385',
    -gapext  => $opt{-gapext} || '2.20',
    -type    => $SeqType,
    );
  while (my($opt,$optval) = each(%ParamClustal)) {
    $ParamClustalPlain .= " $opt=$optval";
  }
  ## method new ... [cf. CVS]
  ## method new II
  #$CallClustal = "$CorePath{call}{clustalw} -sequences"
  #  . " -infile=$FileSeq -outfile=$FileAln $ParamClustalPlain";
  # method new III
  $CallClustal = "$CorePath{call}{clustalw} -align -quicktree"
    . " -infile=$FileSeq -outfile=$FileAln -newtree=$FileDnd $ParamClustalPlain";
  $LogClustal = &ReadFile ("$CallClustal |");
  $debug>1 and printf STDERR "%s. CLUSTAL W log:\n%s", &MySub, $LogClustal;

  # read alignment
  $PlainAln = &ReadFile ($FileAln);
  if (! defined($PlainAln)) {
    $debug and printf STDERR "%s. CLUSTAL W alignment failed - no *.aln generated\n", &MySub;
    $debug and print  STDERR "  call was: $CallClustal\n";
    return undef;
  }
  if ($debug) {
    printf STDERR "%s. CLUSTAL W alignment output - first lines:\n", &MySub;
    print  STDERR map{"$_\n"} &ListMaxfirst([split/\n/,$PlainAln],10), '-'x 10;
  }
  $pAln = &AlnClustalRead (\$PlainAln, -debug=>$dbg2);

  # tidy up, exit SUB
  unless ($debug) {
    unlink $FileSeq, $FileDnd, $FileAln;
  }
  return $pAln;
}


# output alignment entries in fastA format
#
# INTERFACE
# - argument 1: reference to alignment data structure
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]
#   -pure       remove gaps
#
# - return val: fastA-formatted alignment
#
sub AlnToFasta {
  require SeqLab::SeqFormat; SeqLab::SeqFormat->import qw(&SeqentryToFasta);
  my ($pAln,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my $KeyId  = $opt{-KeyId} || $_LibParam{KeyId};

  # construct fastA plain text
  my $PlainFasta='';
  foreach (@$pAln) {
    my $phrase = $_->{offset} ?
      sprintf ('range %d to %d', $_->{offset}, $_->{offset} + (length $_->{sequence}) - 1) : '';
    $PlainFasta .= &SeqentryToFasta ($_,
      -KeyId  => $opt{-KeyId},
      -phrase => $phrase,
      -pure   => $opt{-pure},
      -debug  => $dbg2);
  }

  return $PlainFasta;
}


################################################################################
# functions focusing on sequence symbols
################################################################################


# return alignment with sequence entries being reverse-complement
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               referenced data is left unchanged
# - return val: - reference to reverse-complement alignment data structure
#               - undef if an error occurred
#
# DESCRIPTION
# - the SUB is based on &AlnClone and &SeqStrRevcompl
#
sub AlnRevcompl {
  my ($pAln,%opt) = @_;
  $pAln = &AlnClone ($pAln);  # safe copy of alignment

  # change sequences
  foreach (@$pAln) {
    $_->{sequence} = &SeqStrRevcompl ($_->{sequence});
  }

  return $pAln;
}


# change end gaps in an alignment to masking character
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               referenced data is explicitly changed
#
# - options:
#   -SmbMask    masking character, default: $SeqSmbMask
#
sub _AlnMaskgapEnd {
  my ($pAln, %opt) = @_;
  my ($SmbMask);

  # function parameters
  $SmbMask = $opt{-SmbMask} || $SeqSmbMask;

  # fill sequence beginnings and ends with masking character
  foreach (@$pAln) {
    if ($_->{sequence} =~ m/^-+/) {
      $_->{sequence} = ($SmbMask x length($&)) . $';
    }
    if ($_->{sequence} =~ m/-+$/) {
      $_->{sequence} = $` . ($SmbMask x length($&));
    }
  }
}


# change inner gaps in an alignment to masking character
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               referenced data is explicitly changed
# - argument 2: minimal length of inner gaps which shall be masked
#
# - options:
#   -SmbMask    masking character, default: $SeqSmbMask
#
sub _AlnMaskgapInside {
  my ($pAln, $GapSize, %opt) = @_;
  my ($SmbMask);

  # function parameters
  $SmbMask = $opt{-SmbMask} || $SeqSmbMask;
  if ($GapSize < 1) { return };

  # mask big inner gaps
  foreach (@$pAln) {
    while ($_->{sequence} =~ m/([a-zA-Z])(-{$GapSize,})([a-zA-Z])/) {
      $_->{sequence} = $`.$1. ($SmbMask x length($2)) .$3.$';
    }
  }
}


# work out consensus from alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               referenced data is left unchanged
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]
#   -report     report unknown replacements at positions having an entropy
#               value higher than the specified value. The function returns
#               an additional value, array of hashes with keys:
#               ...
#   -SmbUnk     [STD]
#
# - return val: - reference to alignment data structure, resolved unknowns
#               - undef if an error occurred
#
# DESCRIPTION
# - the purpose of this function is to replace 'unknown characters' like
#   N/X in the sequence entries by the consensus character at that
#   particular alignment position. Though a gap may be the consensus
#   emission at that position the function tries to find the most frequent
#   LETTER for that position.
# - for mixed upper/lower-case alignments lower-case letter parts of the
#   sequences will be hidden before the replacement process and won't reappear
#   in the changed alignment. All letters in the resulting alignment will
#   be upper case.
#
sub AlnResolveUnk {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, $KeyId);
  my ($AlnSeqType, $SmbUnk, $AlnConsens, $pSmbFreq, $pSmbList);
  my ($pSeq, %CurrSmb, $EmitSum, $EmitEntropy, @report);
  my ($CtI, $ConsStrPos, $StrRange);

  # enter sub, work out / check parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $KeyId = $opt{-KeyId} || $_LibParam{KeyId};

  # working copy of alignment
  # - upper-case copy of alignment
  # - determine sequence type
  $opt{-Low2Up} = 1;
  foreach (@$pAln) {
    if ($_->{sequence} =~ m/[A-Z]/) {
      $opt{-LowHide} = 1;
      delete $opt{-Low2Up};
      last;
    }
  }
  $pAln = &AlnClone ($pAln, -LowHide=>$opt{-LowHide}, -Low2Up=>$opt{-Low2Up}, -KeyId=>$KeyId);
  unless (int @$pAln) { return undef }
  $AlnSeqType = &SeqType ($$pAln[0]{sequence});
  $SmbUnk = $opt{-SmbUnk} || $SeqSmbUnk{$AlnSeqType};
  if ($debug) {
    printf STDERR "%s. alignment:\n", &MySub;
    printf STDERR "  sequence type: %s\n", $AlnSeqType||"''";
    printf STDERR "  unknown symbol: %s\n", $SmbUnk||"''";
  }

  # get symbol frequency plot for alignment
  ($AlnConsens, $pSmbFreq, $pSmbList) = &AlnConsens ($pAln, -KeyId=>$KeyId, -debug=>$dbg2);
  if ($debug > 1) {
    printf STDERR "%s. symbol frequency plot as prepared by \&AlnTabSmbFreq\n", &MySub;
    printf STDERR '%6s' . '%4s' x (@$pSmbList+0) . '%9s' ."\n", 'pos', @$pSmbList, 'consens';
    print  STDERR '-' x (6 + 4 * @$pSmbList + 9), "\n";
    for ($CtI=0; $CtI<@$pSmbFreq; $CtI++) {
      printf STDERR '%6s' . '%4s' x (@$pSmbList+0) . '%9s' . "\n",
        $CtI+1, @{$$pSmbFreq[$CtI]}{@$pSmbList}, $$pSmbFreq[$CtI]{consens};
    }
  }

  # loop over each alignment entry
  foreach $pSeq (@$pAln) {

    # replace 'unknown' character by most frequent letter
    while ($$pSeq{sequence} =~ m/$SmbUnk/gi) {
      $ConsStrPos = length $`;
      $StrRange = substr ($AlnConsens, &Max(0,$ConsStrPos-5), 11);
      if ($$pSmbFreq[$ConsStrPos]{ConsLett} eq $SmbUnk) {
        if (defined $opt{-report}) {
          push @report, {
            pos      => $ConsStrPos,
            id       => $$pSeq{$KeyId},
            consens  => $$pSmbFreq[$ConsStrPos]{consens},
            entropy  => 0,
            seqrange => $StrRange,
            };
        }
        next;
      } else {
        substr($$pSeq{sequence},$ConsStrPos,1) = $$pSmbFreq[$ConsStrPos]{ConsLett};
      }
      $debug and printf STDERR "%s. match for unknown symbol, id %s, pos %d, sequence range %s\n", &MySub,
        $$pSeq{$KeyId}, $ConsStrPos+1, $StrRange;

      # report of replacements above the entropy threshold
      if (defined $opt{-report}) {
        unless ($$pSmbFreq[$ConsStrPos]{entropy}) {

          # calculate letter emission sum
          %CurrSmb = map { ($_ => $$pSmbFreq[$ConsStrPos]{$_}) }
                     grep { length ($_) == 1 and $_ ne 'N' and $_ ne '-' }
                     keys %{$$pSmbFreq[$ConsStrPos]};
          unless ($EmitSum = &Sum (values %CurrSmb)) {
            printf STDERR "%s. no emission found at consensus pos. $ConsStrPos\n", &MySub;
            exit 1;
          }

          # calculate emission entropy
          undef $EmitEntropy;
          foreach (keys %CurrSmb) {
            $EmitEntropy += $$pSmbFreq[$ConsStrPos] &&
              ($$pSmbFreq[$ConsStrPos]{$_} / $EmitSum * log ($$pSmbFreq[$ConsStrPos]{$_} / $EmitSum));
          }
          $$pSmbFreq[$ConsStrPos]{entropy} = sprintf ("%.5f", abs $EmitEntropy);
        }

        # report replacement
        if ($$pSmbFreq[$ConsStrPos]{entropy} > $opt{-report}) {
          push @report, {
            pos      => $ConsStrPos,
            id       => $$pSeq{$KeyId},
            consens  => $$pSmbFreq[$ConsStrPos]{consens},
            entropy  => $$pSmbFreq[$ConsStrPos]{entropy},
            seqrange => $StrRange,
            };
        }
      }
    }
  }

  return wantarray ? ($pAln, \@report) : $pAln;
}


################################################################################
# analysis resulting in plot of local measure
################################################################################


# return true base positions of a sequence
#
# INTERFACE
# - argument 1: sequence string
# - return val: - plot, reference to array of positions
#               - undef if an error occurred
#
# DESCRIPTION
# - normally this function would be called on the consensus sequence.
#   Then it returns the true consensus positions of an alignment
# - only upper case letters are regarded as a symbol-emitting position of the
#   consensus sequence.
# - counting philosophy for position values:
#   @PlotPos    gives translation of consensus position (counting start 1) to
#               alignment position (counting start 1).
#
sub AlnTrueSeqpos {
  my ($Seq, %opt) = @_;
  my (@PlotPos);

  # no position 0 (we're counting in biological system)
  push @PlotPos, 0;

  # scan sequence string for base positions
  while ($Seq =~ m/[A-Z]/g) {
    push @PlotPos, (length $`) + 1;
  }

  return \@PlotPos;
}


# calculate coverage plot of an alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnClone
#   -KeySeq      [STD]
#   -MaskGapIns  [STD]
#   -PlotPos     supply a pre-made plot of consensus positions (argument is
#                an array reference). It may already be worked out by the
#                calling code.
#
# - return val:  - plot, reference to value of coverage for each consensus
#                  position
#                - undef if an error occurred
#
# DESCRIPTION
# - if the alignment contains a consensus sequence entry at the first
#   position it's disregarded in the calculation.
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - counting philosophy for position values:
#   @$pPlotPos  gives translation of consensus position (counting start 1) to
#               alignment position (counting start 1).
#   @PlotCover  array position according to alignment position (counting start 0).
#               We don't want to introduce a pseudo-call for position 0 in order
#               to attain counting start 1.
#
# DEBUG, CHANGES, ADDITIONS
# - should extended sequence stretches been hidden before this method?
#   Switches -uplow
#
sub AlnTabCover {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2);
  my ($AlnConsens, $pPlotPos);
  my ($CtPos, $CtCover, $base, @PlotCover);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # get working copy of alignment
  # - delete empty sequence entries / redundant gaps, nice margins
  # - mask end gaps and inner gaps
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq});
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  unless (int @$pAln) { return undef }
  &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
  $opt{-MaskGapIns} and &_AlnMaskgapInside ($pAln, $opt{-MaskGapIns}, -debug=>$dbg2);

  # get position plot if not given
  if ($opt{-PlotPos}) {
    $pPlotPos = $opt{-PlotPos};
    $debug and printf STDERR "%s. got consensus sequence position plot from caller\n", &MySub;
  } else {
    &AlnCompress ($pAln, -debug=>$dbg2);
    $AlnConsens = &AlnConsens ($pAln, -MaskGapEnd=>1, -MaskGapIns=>$opt{-MaskGapIns}, -debug=>$dbg2);
    $pPlotPos = &AlnTrueSeqpos ($AlnConsens, -debug=>$dbg2);
  }

  # loop over consensus positions
  for ($CtPos=1; $CtPos<@$pPlotPos; $CtPos++) {
    $CtCover = 0;

    # check base presence for each alignment entry at current position
    foreach (@$pAln) {
      if (($base = substr($_->{sequence}, $$pPlotPos[$CtPos]-1, 1)) and
           $base ne $SeqSmbMask) {
        $CtCover ++;
      }
    }
    push @PlotCover, $CtCover;
  }
  unshift @PlotCover, $PlotCover[0];

  # exit SUB, return plot
  return \@PlotCover;
}


# work out base frequency table from alignment (all positions)
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                referenced data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnClone
#   -KeySeq      [STD]
#   -MaskGapEnd  [STD]
#   -MaskGapIns  [STD]
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter
#                => &AlnResolveUnk.
#   -SmbUnk      [STD]
#   -uplow       regard upper and lower letters in the same way, lower case
#                temporarily changed to upper case. As a default, lower-case
#                letters are disregarded in the frequency statistics.
#                Cmp. suggestion in 'DEBUG, CHANGES, ADDITIONS - switch -CaseSensit').
#
# - return val:  - table of symbol frequencies (table data type AH)
#                  frequencies are actually the total counts of the occurrence
#                  of a given symbol
#                - undef if an error occurred (no alignment entries at all, ...)
#
# DESCRIPTION
# - if the alignment contains a consensus sequence entry at the first
#   position it's disregarded in the calculation.
# - counting philosophy for position values:
#   @TabFreq    array position according to alignment position (counting start 0).
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - Special rules in determination of the consensus sequence:
#   N  It's only used in the consensus, if there's no real letter
#      alternative.
#   -  a gap only appears in the consensus, if it has a higher frequency
#      than all real letter alternatives together.
#
sub AlnTabSmbFreq {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, $SmbUnk);
  my ($AlnLen, $AlnPos, %SmbAtcolWork, %SmbAtcolReal);
  my ($SmbMostfreqChar, $SmbMostfreqLett, $SmbFreqsumLett);
  my (@TabFreq, %SmbAll, @SmbAll);

  ##############################################################################
  # pre-work

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $SmbUnk = $opt{-SmbUnk} || $SeqSmbUnk{&SeqType($$pAln[0]{sequence})} or return undef;

  # working copy of alignment
  # - change letters according to -uplow option
  # - optionally resolve unknown symbol
  # - mask end gaps and inner gaps
  unless ($opt{-Low2Up} = $opt{-uplow}) {
    foreach (@$pAln) {
      if ($_->{sequence} =~ m/[A-Z]/) {
        $opt{-LowHide} = 1;
        last;
      }
    }
  }
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -LowHide=>$opt{-LowHide}, -Low2Up=>$opt{-Low2Up},
    -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq}, -KeyStd=>1);
  delete $opt{-KeyId};
  delete $opt{-KeySeq};
  unless (int @$pAln) { return undef }
  $opt{-ResolveUnk} and &AlnResolveUnk ($pAln, %opt);
  $opt{-MaskGapEnd} and &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
  $opt{-MaskGapIns} and &_AlnMaskgapInside ($pAln, $opt{-MaskGapIns}, -debug=>$dbg2);

  # start-up resources
  $AlnLen = &Max (map { length $_->{sequence} } @$pAln);
  if ($debug) {
    printf STDERR "%s. alignment:\n", &MySub;
    printf STDERR "  size: %d x %d\n", $AlnLen, int @$pAln;
    printf STDERR "  unknown symbol: %s\n", $SmbUnk;
  }

  ##############################################################################
  # symbol frequencies, consensus

  # loop over alignment positions
  for ($AlnPos=0; $AlnPos<$AlnLen; $AlnPos++) {

    # reinitialise counter
    undef %SmbAtcolWork;

    # evaluate base frequency at particular position
    foreach (@$pAln) {
      if ($AlnPos < length($_->{sequence})) {
        $SmbAtcolWork{substr($_->{sequence},$AlnPos,1)} ++;
      }
    }

    # special handling for special symbols
    # sample frequency counts of current position
    # low scoring of unknown emissions
    delete $SmbAtcolWork{$SeqSmbMask};
    #delete $SmbAtcolWork{''};
    %SmbAtcolReal = %SmbAtcolWork;  # conserve what we really have found at the current alignment column
    $SmbAtcolWork{$SmbUnk} and $SmbAtcolWork{$SmbUnk} = 0.5;

    # update hash of all used symbols
    foreach (keys %SmbAtcolWork) { $SmbAll{$_} = 1; }

    # derive most frequent symbol
    # for equal frequencies the consensus decision is based on descendent
    #   alphabetical order
    ($SmbMostfreqChar) = sort {
      $SmbAtcolWork{$b} <=> $SmbAtcolWork{$a} or $b cmp $a or $b <=> $a;
      } keys %SmbAtcolWork;
    unless ($SmbAtcolWork{$SmbMostfreqChar}) {  # default for an empty column
      $SmbMostfreqChar = '-';
    }

    # derive most frequent letter and frequency sum of letters
    # for equal frequencies the consensus decision is made in descendent
    #   alphabetical order of available letters
    $SmbFreqsumLett = &Sum (map { $SmbAtcolWork{$_} } grep { m/[a-z]/i } keys %SmbAtcolWork);
    unless (($SmbMostfreqLett = $SmbMostfreqChar) =~ m/[a-z]/i) {
      $SmbMostfreqLett = (sort {
        $SmbAtcolWork{$b} <=> $SmbAtcolWork{$a} or $b cmp $a;
        } grep { m/[a-z]/i } keys %SmbAtcolWork)[0];
      unless ($SmbAtcolWork{$SmbMostfreqLett||''}) {
        $SmbMostfreqLett = 'N';
      }
    }

    # sample frequency counts of current position to array
    push @TabFreq, { %SmbAtcolReal,
      consens  => ($SmbFreqsumLett>=($SmbAtcolReal{$SmbMostfreqChar||''}||0)) ?
                  $SmbMostfreqLett : $SmbMostfreqChar,
      ConsLett => $SmbMostfreqLett,
      LettFreq => $SmbFreqsumLett,
      cover    => &Sum (values %SmbAtcolReal),
      };
  }

  ##############################################################################
  # refinement, tidy up

  # no leading / final stretches of poly(UNK)
  foreach (@TabFreq) {
    if ($_->{consens} eq $SmbUnk) {
      $_->{consens} = '-';
    } else { last }
  }
  foreach (reverse @TabFreq) {
    if ($_->{consens} eq $SmbUnk) {
      $_->{consens} = '-';
    } else { last }
  }

  if (wantarray) {
    # prepare sorted array of symbols
    @SmbAll = sort {
      ($a eq '-') ? -1:0 or ($b eq '-') ? 1:0 or
      $a cmp $b;
      } keys %SmbAll;
    return (\@TabFreq, \@SmbAll);
  } else {
    return \@TabFreq;
  }
}


# work out base frequency plot from alignment (consensus positions)
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                referenced data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnTabSmbFreq
#   -MaskGapEnd  [STD] => &AlnTabSmbFreq
#   -MaskGapIns  [STD] => &AlnTabSmbFreq
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter.
#                => &AlnTabSmbFreq => &AlnResolveUnk
#   -uplow       regard upper and lower letters in the same way, lower case
#                temporarily changed to upper case. As a default, lower-case
#                letters are disregarded in the frequency statistics.
#                => &AlnTabSmbFreq
#                Cmp. suggestion in 'DEBUG, CHANGES, ADDITIONS - switch -CaseSensit').
#
# - return val:  - table of symbol frequencies and the other stuff (table
#                  data type AH)
#                - undef if an error occurred (no alignment entries at all, ...)
#
# DESCRIPTION
# - comparison with  &AlnTabSmbFreq:
#   - Here, the frequencies are plotted against the
#     effective consensus sequence positions. As a consequence, the emission
#     type 'insertion' (entry ins) arises as a new entry in the hash of
#     emission possibilities. The count for insertions is held in the
#     entry of the next following consensus position.
#   - Here, an emission entropy for each resulting consensus sequence
#     position is calculated.
#   - In &AlnTabSmbFreq, alignment positions were not counted in biological
#     system because we're not dealing with a real sequence. Position 0 is the
#     first alignment position where any sequence entry starts to emit a symbol.
#     Here, we keep the counting philosophy because we're still dealing with
#     an array (array of consensus positions). Would it make more sense to
#     insert an empty position entry for position 0?
# - details of emission entropy calculation:
#   - 'unknown' letters are regarded as several independet emissions, unless
#     the -ResolveUnk option is set.
#   - gaps are regarded as several independet emissions, unless the -MaskGapIns
#     option takes effect.
# - if the alignment contains a consensus sequence entry at the first
#   position it's disregarded in the calculation, see &AlnTabSmbFreq.
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - counting philosophy for position values:
#   @$pSmbFreq    array position according to alignment position (counting start 0).
#   @SmbFreqCPos  array position according to alignment position (counting start 0).
#
sub AlnTabSmbFreqCPos {
  my ($pAln, %opt) = @_;
  my ($debug);
  my ($pSmbFreq, $pSmb);
  my ($pSmbFreqCurr, $CtI, $CtIns, $CtCurrIns);
  my (%CurrSmb, $EmitSum, $EmitEntp, @SmbFreqCPos);

  # function parameters, pre-work
  $debug = $opt{-debug};
  unless (int @$pAln) { return undef }

  # get symbol frequency plot for alignment
  # - frequencies are actually the total counts of the occurrence
  #   of a given symbol
  ($pSmbFreq, $pSmb) = &AlnTabSmbFreq ($pAln, %opt);

  # loop over each alignment position
  for ($CtI=0; $CtI<@$pSmbFreq; $CtI++) {
    unless (defined ($pSmbFreqCurr = $$pSmbFreq[$CtI])) { next }
    $debug and printf STDERR "%s. calculation has reached position %d of %d\n", &MySub, $CtI, int @$pSmbFreq;

    # count following positions showing an insertion (no consensus symbol)
    # cases of insertion are scored the same way, no differentiation for
    # nature of inserted sequence
    undef $CtIns;
    while ($CtI+1 < @$pSmbFreq and $$pSmbFreq[$CtI+1]{consens} eq '-') {

      # count real symbol emissions at next position
      # symbols are represented by upper-case letters only
      undef $CtCurrIns;
      foreach (keys %{$$pSmbFreq[$CtI+1]}) {
        if ($_ =~ m/[A-Z]/) { $CtCurrIns += $$pSmbFreq[$CtI+1]{$_}; }
      }

      # get maximum of inserted sequence stretches
      $CtIns = &Max ($CtIns, $CtCurrIns);

      $CtI ++;
    }
    # enter new 'ins' entry in count hash
    $CtIns and $$pSmbFreqCurr{ins} = $CtIns;

    # enter new emission sum entry in count hash
    %CurrSmb = map { ($_ => $$pSmbFreqCurr{$_}) }
               grep { (length($_) == 1 or $_ eq 'ins') and $$pSmbFreqCurr{$_} }
               keys %$pSmbFreqCurr;
    unless ($EmitSum = &Sum (values %CurrSmb)) { next };
    $$pSmbFreqCurr{sum} = $EmitSum;

    # enter new entropy entry in count hash
    # - we calculate the absolute value of entropy rather than original signed
    #   value
    # - avoid zero or negative values for any $$pSmbFreqCurr{$_}!
    $EmitEntp = 0;
    foreach (keys %CurrSmb) {
      $EmitEntp += $$pSmbFreqCurr{$_} &&
        ($$pSmbFreqCurr{$_} / $EmitSum * log ($$pSmbFreqCurr{$_} / $EmitSum));
    }
    $$pSmbFreqCurr{entropy} = sprintf ("%.5f", $EmitEntp);

    # enter expanded count hash to new plot array
    push @SmbFreqCPos, $pSmbFreqCurr;

  } # loop for plot position
  undef $pSmbFreq;

  # udpate sorted array of symbols
  if ($$pSmb[0] eq '-') { shift @$pSmb; }
  push @$pSmb, ('-', 'ins', 'sum', 'consens', 'entropy');

  # exit SUB
  $debug and printf STDERR "%s. reached end of SUB\n", &MySub;
  return wantarray ? (\@SmbFreqCPos, $pSmb) : \@SmbFreqCPos;
}


# table of consensus deviations in alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                referenced data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnClone
#   -KeySeq      [STD]
#   -MaskGapEnd  [STD]
#   -MaskGapIns  [STD]. This has effect only on consensus calculation in
#                &AlnTabSmbFreq.
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter
#                => &AlnResolveUnk.
#   -SmbUnk      [STD]
#
# - return val:  - reference to array of deviation entries (cp. above)
#                - undef if an error occurred (no alignment entries at all etc.)
#
# DESCRIPTION
# - if the alignment contains a consensus sequence entry at the first
#   position it's disregarded in the calculation.
# - upper and lower letters are regarded in the same way, lower case
#   changed to upper case (in the working copy of the alignment).
#   Cmp. suggestion in 'DEBUG, CHANGES, ADDITIONS - switch -CaseSensit').
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - during search, consensus deviations are organized in a tree
#   (%TreeConsDev) in order to to group identical deviations
#   together while proceeding over the array of aligned sequences:
#   - hash position -> hash class -> hash emission -> deviation entry
#   - deviation entry:
#     pos    ...
#     class  classification of the deviation: SNP, ins, del
#     emit   particular sequence emission
#     freq   relative frequency of the polymorphism
#     ...
# - counting philosophy for position values:
#   @$pSmbFreq   array position according to alignment position (counting start 0).
#   @ConsDev     array position according to alignment position (counting start 0).
#   $AlnPos      alignment position in computational system (counting start 0).
#   $$pConsDevCurr{pos}  alignment position in biological system (counting start 1).
#   $$pConsDevCurr{ConsPos}  alignment position in biological system (counting start 1).
#
sub AlnTabConsDev {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, %MaskGapOpt);
  my ($pSmbFreq, $AlnLen, @PosAln2Cons);
  my ($AlnPos, $ConsPos, $ConsChar, $SeqChar, $sSeqSub, %TreeConsDev);
  my ($pEntry, $class, $emit, $pConsDevCurr, @ConsDev);
  my ($CtJ);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  %MaskGapOpt = (map { ($_=>$opt{$_}) } '-MaskGapEnd', '-MaskGapIns');

  # working copy of alignment
  # - optionally resolve unknown symbol
  # - mask end gaps only
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -Low2Up=>1, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq});
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  unless (int @$pAln) { return undef }
  $opt{-ResolveUnk} and &AlnResolveUnk ($pAln, -debug=>$dbg2);
  $opt{-MaskGapEnd} and &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);

  # start-up resources
  $pSmbFreq = &AlnTabSmbFreq ($pAln, %MaskGapOpt, -debug=>$dbg2);
  $AlnLen = @$pSmbFreq;
  $debug and printf STDERR "%s. alignment:\n", &MySub;
  $debug and printf STDERR "  size: %d x %d\n", $AlnLen, int @$pAln;

  # loop over alignment positions
  %TreeConsDev = ();
  for ($AlnPos=0; $AlnPos<$AlnLen; $AlnPos++) {
    $ConsChar = $$pSmbFreq[$AlnPos]{consens};
    $ConsPos += ($ConsChar =~ m/[a-z]/i) ? 1 : 0;
    push @PosAln2Cons, $ConsPos;

    # loop over non-masked sequence entries
    foreach $pEntry (@$pAln) {
      $SeqChar = substr ($$pEntry{sequence}, $AlnPos, 1) or next;
      if ($SeqChar eq $SeqSmbMask) { next }

      # compare sequence emission at particular position to consensus emission
      # deviation?
      if ($SeqChar ne $ConsChar) {

        # debug
        $debug and printf STDERR "%s. found deviation: at pos %d cons has %s, seq %s has %s\n", &MySub,
          $AlnPos, $ConsChar, $$pEntry{id}, $SeqChar;

        # SNP or AA switch
        if ($ConsChar =~ m/[a-z]/i and $SeqChar =~ m/[a-z]/i) {
          $debug and printf STDERR "%s. classified as SNP / AA switch\n", &MySub;
          push @{$TreeConsDev{$AlnPos}{SNP}{$SeqChar}{id}}, $$pEntry{id};
          next;
        }

        # insertion, start of insertion?
        if ($ConsChar eq '-') {
          if ($AlnPos == 0 or $$pSmbFreq[$AlnPos-1]{consens} =~ m/[a-z]/i) {
            $debug and printf STDERR "%s. classified as start of insertion\n", &MySub;

            # grab full inserted sequence
            $CtJ = 1;
            while ($AlnPos+$CtJ < @$pSmbFreq
               and $$pSmbFreq[$AlnPos+$CtJ]{consens} eq '-')
               { $CtJ ++; };
            $sSeqSub = substr ($$pEntry{sequence}, $AlnPos, $CtJ);
            push @{$TreeConsDev{$AlnPos}{ins}{$sSeqSub}{id}}, $$pEntry{id};
            next;
          }

          # NOTE: in an insertion we may find several more polymorphisms
          # we don't further sub-classify them here.
        }

        # deletion
        if ($SeqChar eq '-') {

          # start deletion?
          if ($AlnPos == 0 or substr ($$pEntry{sequence}, $AlnPos-1, 1) =~ m/[a-z]/i) {
            $debug and printf STDERR "%s. classified as deletion\n", &MySub;
            $sSeqSub = substr ($$pEntry{sequence}, $AlnPos, length $$pEntry{sequence});
            $sSeqSub =~ m/^-+/;
            $CtJ = length $&;
            push @{$TreeConsDev{$AlnPos}{del}{$CtJ}{id}}, $$pEntry{id};
            next;
          }
        }
      }
    }
  }
  # debug
  $debug and printf STDERR "%s. tree data structure for deviations:\n", &MySub;
  $debug and &DataPrint(\%TreeConsDev,-handle=>\*STDERR,-space=>2);

  # re-organise data
  foreach $AlnPos (keys %TreeConsDev) {
    foreach $class (keys %{$TreeConsDev{$AlnPos}}) {
      if ($class =~ m/^Cons/i) { next }
      foreach $emit (keys %{$TreeConsDev{$AlnPos}{$class}}) {
        $pConsDevCurr = $TreeConsDev{$AlnPos}{$class}{$emit};
        $$pConsDevCurr{pos}     = $AlnPos + 1;
        $$pConsDevCurr{ConsPos} = $PosAln2Cons[$AlnPos];
        $$pConsDevCurr{ConsSmb} = $$pSmbFreq[$AlnPos]{consens};
        $$pConsDevCurr{class} = $class;
        $$pConsDevCurr{emit}  = $emit;
        $$pConsDevCurr{freq}  = int (@{$$pConsDevCurr{id}}) / $$pSmbFreq[$AlnPos]{cover};
        $$pConsDevCurr{id}    = join (' ', @{$$pConsDevCurr{id}});
        push @ConsDev, $pConsDevCurr;
      }
    }
  }

  # exit SUB
  return \@ConsDev;
}


# calculate plot of local conservation for an alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnClone
#   -KeySeq      [STD]
#   -MaskGapEnd  [STD]
#   -MaskGapIns  [STD]
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter
#                => &AlnResolveUnk.
#
# - return val:  - plot, reference to value of conservation for each consensus
#                  position
#                - undef if an error occurred
#
# DESCRIPTION
# - conservation is expressed as a floating point value 0 < RelIdentity < 1.
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - counting philosophy for position values:
#   @$pPlotPos    gives translation of consensus position (counting start 1) to
#                 alignment position (counting start 1).
#   @PlotConserv  array position according to alignment position (counting start 0).
#
# DEBUG, CHANGES, ADDITIONS
# - should extended sequence stretches been hidden before this method?
#   This may be more or less part of the calling functions in SeqLab::AlnProj.pm etc.
#
sub AlnTabRelConserv {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, %MaskGapOpt);
  my ($AlnCons, $pPlotPos, $pPlotCover);
  my ($CtPos, $StrposCurr, $StrposNext, $AlnConsFrag,
      $CtIdentity, @PlotConserv);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  %MaskGapOpt = (map { ($_=>$opt{$_}) } '-MaskGapEnd', '-MaskGapIns');

  # get working copy of alignment
  # - delete empty sequence entries / redundant gaps, nice margins
  # - optionally resolve unknown symbol
  # - mask end gaps and inner gaps
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq});
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  $opt{-ResolveUnk} and &AlnResolveUnk ($pAln, -debug=>$dbg2);
  &AlnCompress ($pAln, -debug=>$dbg2);
  $opt{-MaskGapEnd} and &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
  $opt{-MaskGapIns} and &_AlnMaskgapInside ($pAln, $opt{-MaskGapIns}, -debug=>$dbg2);

  # fresh consensus, position plot, coverage plot
  $AlnCons = &AlnConsens ($pAln, %MaskGapOpt, -debug=>$dbg2);
  $pPlotPos = &AlnTrueSeqpos ($AlnCons, -debug=>$dbg2);
  push @$pPlotPos, $$pPlotPos[$#{$pPlotPos}] + 1;
  $pPlotCover = &AlnTabCover ($pAln, -PlosPos=>$pPlotPos, %MaskGapOpt, -debug=>$dbg2);

  # loop over consensus positions
  for ($CtPos=1; $CtPos<(@$pPlotPos-1); $CtPos++) {

    # translate biological positions to string positions
    $CtIdentity = 0;
    $StrposCurr = $$pPlotPos[$CtPos] - 1;
    $StrposNext = $$pPlotPos[$CtPos+1] - 1;
    $AlnConsFrag = substr ($AlnCons, $StrposCurr, $StrposNext-$StrposCurr);

    # check identity for each alignment entry at current position
    foreach (@$pAln) {
      if (substr($_->{sequence},$StrposCurr,$StrposNext-$StrposCurr) eq $AlnConsFrag) {
        $CtIdentity ++;
      }
    }
    push @PlotConserv, ($$pPlotCover[$CtPos] ? $CtIdentity / $$pPlotCover[$CtPos] : 0);
  }

  # exit SUB, return plot
  return \@PlotConserv;
}


# calculate plot of local mean matrix score for an alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnClone
#   -KeySeq      [STD]
#   -MaskGapEnd  [STD]
#   -MaskGapIns  [STD]
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter
#                => &AlnResolveUnk.
#   -table       matrix table, default: BLOSUM40
#
# - return val:  - plot, reference to value of conservation for each consensus
#                  position
#                - undef if an error occurred
#
# DESCRIPTION
# - for switches -MaskGap*, 'masking' means that gaps are not regarded as
#   sequences. As a consequence, the effective coverage at these ranges
#   goes down.
# - counting philosophy for position values:
#   @$pPlotPos     gives translation of consensus position (counting start 1) to
#                  alignment position (counting start 1).
#   @PlotMatrixScore  array position according to alignment position (counting start 0).
#
# DEBUG, CHANGES, ADDITIONS
# - should extended sequence stretches been hidden before this method?
#   This may be more or less part of the calling functions in SeqLab::AlnProj.pm etc.
#
sub AlnTabMatScore {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, %MaskGapOpt);
  my ($AlnCons, $pPlotPos);
  my ($CtPos, $CtStrpos, @smb, @PlotMatrixScore);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  %MaskGapOpt = (map { ($_=>$opt{$_}) } '-MaskGapEnd', '-MaskGapIns');

  # get working copy of alignment
  # - delete empty sequence entries / redundant gaps, nice margins
  # - optionally resolve unknown symbol
  # - mask end gaps and inner gaps
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq});
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  $opt{-ResolveUnk} and &AlnResolveUnk ($pAln, -debug=>$dbg2);
  &AlnCompress ($pAln, -debug=>$dbg2);
  $opt{-MaskGapEnd} and &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
  $opt{-MaskGapIns} and &_AlnMaskgapInside ($pAln, $opt{-MaskGapIns}, -debug=>$dbg2);

  # fresh consensus, position plot, coverage plot
  $AlnCons = &AlnConsens ($pAln, %MaskGapOpt, -debug=>$dbg2);
  $pPlotPos = &AlnTrueSeqpos ($AlnCons, -debug=>$dbg2);
  push @$pPlotPos, $$pPlotPos[$#{$pPlotPos}] + 1;

  # loop over consensus positions
  for ($CtPos=1; $CtPos<@$pPlotPos; $CtPos++) {
    # translate biological positions to string positions
    $CtStrpos = $$pPlotPos[$CtPos] - 1;

    # filter masked entries, convert gaps to 'unknown'
    @smb = (
      map { m/^-$/ ? 'X' : $_; }
      map { m/^Z$/ ? () : $_; }
      map { substr ($_->{sequence},$CtStrpos,1) }
      @$pAln);

    # mean matrix score at current position
    push @PlotMatrixScore, &CompMatrixMean (\@smb, -table=>$opt{-table}, -debug=>$dbg2);
  }

  # exit SUB, return plot
  return \@PlotMatrixScore;
}


# local frameshift mutation plot for an alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               data is left unchanged
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]. This is only needed to identify an existing
#               consensus sequence entry. => &AlnClone
#   -KeySeq     [STD]
#
# - return val: - plot, reference to value of conservation for each consensus
#                 position
#               - undef if an error occurred
#
# DESCRIPTION
# - mutational distribution is expressed as a floating point value:
#   0 < RelMutFrequ < 1.
# - wouldn't masking of inner gaps have an effect on this analysis?
#   - it definitely has an effect on consensus calculation
#   - so far, this is such a specialized function that we do not have to
#     care about universality of the procedure
# - counting philosophy for position values:
#   @$pPlotPos      gives translation of consensus position (counting start 1) to
#                   alignment position (counting start 1).
#   @$pPlotCover    array position according to alignment position (counting start 0).
#   @PlotFrameshift array position according to alignment position (counting start 0).
#
# DEBUG, CHANGES, ADDITIONS
# - specify frameshift score in relation to local coverage (given in
#   @$pPlotCover).
# - should extended sequence stretches been hidden before this method?
#   This may be more or less part of the calling functions in SeqLab::AlnProj.pm etc.
#   => establish switch -upper
#
sub AlnTabFrameshift {
  my $MaxFramedel = 15;
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2);
  my ($AlnCons, $pPlotPos, $pPlotCover);
  my ($PosMaskAnte, $PosMaskPost);
  my ($pSeq, $CtPos, $StrposCurr, $StrposNext, $AlnConsFrag, $SeqFrag);
  my ($CtDel, @SeqMut, @PlotFrameshift);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # working copy of alignment
  # - delete empty sequence entries / redundant gaps, nice margins
  # - mask end gaps [and inner gaps?]
  $pAln = &AlnClone ($pAln, -DelConsens=>1, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq});
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  &AlnCompress ($pAln, -debug=>$dbg2);
  &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
    # this has effect on 5'/3'-skipping later on

  # fresh consensus, position plot, coverage plot
  $AlnCons = &AlnConsens ($pAln, -MaskGapEnd=>1, -MaskGapIns=>0, -debug=>$dbg2);
  $pPlotPos = &AlnTrueSeqpos ($AlnCons, -debug=>$dbg2);
  push @$pPlotPos, $$pPlotPos[$#{$pPlotPos}] + 1;
  $pPlotCover = &AlnTabCover ($pAln, -PlosPos=>$pPlotPos, -MaskGapEnd=>1, -MaskGapIns=>0, -debug=>$dbg2);

  # loop over each alignment entry
  push @PlotFrameshift, 0;
  foreach $pSeq (@$pAln) {

    # locate non-emitting ranges at the end of sequence
    # inner gaps are unmasked, see call of &_AlnMaskgapEnd and &AlnTabCover
    $$pSeq{sequence} =~ m/^(${SeqSmbMask}*)([^${SeqSmbMask}]+)${SeqSmbMask}*$/;
    ($PosMaskAnte, $PosMaskPost) = (length ($1), length ($1) + length ($2));

    # skip 5' non-emitting range of sequence
    @SeqMut = (0);
    for ($CtPos=1; $CtPos<(@$pPlotPos-1); $CtPos++) {
      if ($$pPlotPos[$CtPos] > $PosMaskAnte) { last }
      push @SeqMut, 0;
    }

    # loop over consensus positions on emitting range of sequence
    $CtDel = 0;
    for ( ; $CtPos<(@$pPlotPos-1); $CtPos++) {
      if ($$pPlotPos[$CtPos] > $PosMaskPost) { last }

      # translate biological positions to string positions
      $StrposCurr = $$pPlotPos[$CtPos] - 1;
      $StrposNext = $$pPlotPos[$CtPos+1] - 1;
      $AlnConsFrag = substr ($AlnCons, $StrposCurr, $StrposNext-$StrposCurr);
      $SeqFrag  = substr($$pSeq{sequence}, $StrposCurr, $StrposNext-$StrposCurr);

      # sequence equals consensus
      if ($SeqFrag eq $AlnConsFrag) {
        if ($CtDel) {
          if ($CtDel % 3 or $CtDel > $MaxFramedel) {
            push @SeqMut, (1/$CtDel) x $CtDel;
          } else {
            push @SeqMut, (0) x $CtDel;
          }
          $CtDel = 0;
        }
        push @SeqMut, 0;
        next;
      }

      # sequence is deleted
      # a dispersion of the frameshift count over the deleted range is made
      if ($SeqFrag =~ m/^-+$/) {
        $CtDel ++;
        next;
      }

      # sequence has an insertion
      my $CtIns;
      if (
        ($CtIns=&MatchCt(\$SeqFrag,'[a-zA-Z]')-1) and
        ($CtIns%3 or $CtIns>$MaxFramedel)
      ) {
        push @SeqMut, 1;
        next;
      }

      # base exchange
      push @SeqMut, 0;
    }

    # sum values
    for ($CtPos=0 ; $CtPos<@SeqMut; $CtPos++) {
      $PlotFrameshift[$CtPos] += $SeqMut[$CtPos];
    }

    # debug for current sequence
    if ($debug) {
      printf STDERR "%s. frameshift plot for ID %s\n", &MySub, $$pSeq{id}||"''";
      printf STDERR "  ante masked pos.: %d\n", $PosMaskAnte;
      printf STDERR "  post masked pos.: %d\n", $PosMaskPost;
      for ($CtPos=0 ; $CtPos<@SeqMut; $CtPos++) {
        printf STDERR "    %d -> %.2f\n", $CtPos, $SeqMut[$CtPos];
      }
    }
  }

  # exit SUB, return plot
  return \@PlotFrameshift;
}


################################################################################
# analysis resulting in global measure
################################################################################


# check degree of overall identity in an alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                referenced data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeySeq      [STD]
#   -LenRef      The calculation of sequence range length is referring to the
#                length of the ungapped consensus sequence, not on the number
#                of positions showing an effective identity/mismatch (default).
#                The option is a boolean type.
#   -MaskGapEnd  [STD] *** implement me ***
#   -MaskGapIns  [STD] *** implement me ***
#   -MatchUnk    regard this letter (e.g. 'N' or 'X') as matching to any other
#                emission other than a gap (letter '-').
#   -uplow       disregard the difference between upper and lower-case letters
#                (default: see description above). With this switch set the
#                function works faster than in the default mode.
#
# - return val:  - relative identity (0 <= I <= 1)
#                - undef if an error occurred (no effective sequence range
#                  length, no alignment entries at all, ...)
#
# DESCRIPTION
# - cmp. &AlnTabRelConserv for positionally resolved identity calculation.
# - end gaps in the sequences are disregarded in the identity check.
# - If sequence letters of both, lower and upper case are encountered, those
#   of lower case are hidden ('gapped out') from the working alignment,
#   and hence disregarded in the identity statistics. This behaviour can
#   be switched off using -uplow => TRUE.
# - The first sequence entry is regarded as the master sequence entry.
#   For option -LenRef the calculation of sequence range length is referring
#   to the length of the ungapped master sequence.
#
sub AlnIdentity {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, $KeySeq, $MatchUnk);
  my ($LenBlock, $CtLen, %CtBase, $CtTemp);
  my ($CtIdent, $LenRef);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $KeySeq = $opt{-KeySeq} || $_LibParam{KeySeq};
  $MatchUnk = $opt{-MatchUnk} || $SeqSmbUnk{&SeqType($$pAln[0]{$KeySeq})};
  $MatchUnk = $opt{-uplow} ? uc $MatchUnk : $MatchUnk;
  if ($MatchUnk !~ m/^[a-z]$/i) {
    printf STDERR "%s. ERROR: unknown symbol is undefined or non-letter: %s\n", &MySub, $MatchUnk||"''";
    exit 1;
  }
  if ($debug) {
    printf STDERR "%s. function entered\n", &MySub;
    printf STDERR "  first sequence: %s, %s\n", $$pAln[0]{id}, $$pAln[0]{$KeySeq};
    printf STDERR "  unknown symbol: %s\n", $MatchUnk;
  }

  # working copy of alignment according to -uplow option
  # - delete empty sequence entries / redundant gaps, nice margins
  # - mask end gaps
  unless ($opt{-Low2Up} = $opt{-uplow}) {
    foreach (@$pAln) {
      if ($_->{$KeySeq} =~ m/[A-Z]/) {
        $opt{-LowHide} = 1;
        last;
      }
    }
  }
  $pAln = &AlnClone ($pAln, -LowHide=>$opt{-LowHide}, -Low2Up=>$opt{-Low2Up});
  unless (int @$pAln) { return undef }
  # length of ungapped consensus sequence
  # this needs to be done before &_AlnMaskgapEnd
  if ($opt{-LenRef}) {
    $LenRef = length (&SeqStrPure ($$pAln[0]{$KeySeq}));
  }
  &AlnCompress ($pAln, -debug=>$dbg2);
  &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);

  # analyse sequence identities
  $LenBlock = length ($$pAln[0]{$KeySeq});
  for (my $CtI=0; $CtI<$LenBlock; $CtI++) {

    # reinitialise counter
    undef %CtBase;

    # evaluate base frequency at particular position
    foreach (@$pAln) {
      $CtBase{substr($_->{$KeySeq}, $CtI, 1)} ++;
    }
    delete $CtBase{$SeqSmbMask};
    delete $CtBase{''};
    if ($MatchUnk and ! $CtBase{'-'} and keys(%CtBase) > 1) {
      $CtTemp = $CtBase{$MatchUnk};
      delete $CtBase{$MatchUnk};
      $CtBase{(keys(%CtBase))[0]} += $CtTemp;
    }

    # differences
    if (int keys(%CtBase) > 1) {
      $CtLen ++;
      next;
    }

    # only one sequence or identity over many sequences?
    if ($CtBase{(keys(%CtBase))[0]} <= 1) {
      next;
    } else {
      $CtLen ++;
      $CtIdent ++;
    }
  }

  # final calculation, exit SUB
  $LenRef = $opt{-LenRef} ? $LenRef : $CtLen;
  return $LenRef ? ($CtIdent/$LenRef) : undef;
}


# calculate symbol diversity in an alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                data is left unchanged
#
# - options:
#   ...          all options of: &AlnConsens, &AlnTabConsDev
#   -ConsLen     supply pre-calculated value for consensus length
#   -ConsDev     supply pre-calculated plot of consensus deviations
#   -debug       [STD]
#   -KeyId       [STD]. This is only needed to identify an existing
#                consensus sequence entry. => &AlnConsens
#   -MaskGapEnd  [STD] => &AlnConsens, &AlnTabConsDev
#   -MaskGapIns  [STD] => &AlnConsens, &AlnTabConsDev
#
# - return val:  - pi value
#                - undef if an error occurred
#
# DESCRIPTION
# - the calculation of value pi is based on:
#   Nei, M., W.-H. Li. Mathematical model for studying genetic variation
#   in terms of restriction endonucleases.
#   Proc. Natl. Acad. Sci. USA 76, 5269-5273 (1979).
#
sub AlnDiversity {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, $AlnConsLen, $pTabConsDev);
  my ($pPolym, $CtPos, $diversity, $CtI, $CtJ);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  unless (int @$pAln) { return undef }

  # get consensus length, plot of consensus deviations
  $AlnConsLen  = $opt{-ConsLen} || length (scalar &AlnConsens ($pAln, %opt));
  $pTabConsDev = $opt{-ConsDev} || (scalar &AlnTabConsDev ($pAln, %opt));

  # each polymorphic site shows at least the consensus emission which's
  # not included in the ConsDev plot.
  $pPolym = {};
  foreach (@$pTabConsDev) {
    push @{$$pPolym{$_->{pos}}}, $_;
  }

  # calculate symbol diversity
  foreach $CtPos (sort { $a <=> $b } keys %$pPolym) {
    push @{$$pPolym{$CtPos}}, {
      emit => 'CONS',
      freq => 1 - &Sum (map { $_->{freq} } @{$$pPolym{$CtPos}}),
      };
    foreach $CtI (@{$$pPolym{$CtPos}}) {
      foreach $CtJ (@{$$pPolym{$CtPos}}) {
        $diversity += ($CtI == $CtJ) ? 0 : $CtI->{freq} * $CtJ->{freq};
      }
    }
  }
  $diversity /= $AlnConsLen;

  # exit SUB
  return $diversity;
}


# work out consensus from alignment
#
# INTERFACE
# - argument 1: reference to alignment data structure
#               referenced data is left unchanged
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]. This is only needed to identify an existing
#               consensus sequence entry. => &AlnTabSmbFreq
#   -KeySeq     [STD]
#   -MaskGapEnd [STD] => &AlnTabSmbFreq
#   -MaskGapIns [STD] => &AlnTabSmbFreq
#   -SmbUnk     [STD] => &AlnTabSmbFreq
#   -uplow      regard upper and lower letters in the same way, lower case
#               temporarily changed to upper case. As a default lower-case
#               letters are disregarded in the frequency statistics.
#               => &AlnTabSmbFreq
#
# - return val: - single value:
#                 - consensus sequence string, including gap marks. Consensus
#                   sequence has the same length as the maximum sequence length
#                   of all alignment entries.
#               - wantarray:
#                 - consensus sequence string, including gap marks. Consensus
#                   sequence has the same length as the maximum sequence length
#                   of all alignment entries.
#                 - plot of base frequencies (array reference) as given by
#                   &AlnTabSmbFreq.
#                 - array of observed symbols (reference) as returned by
#                   &AlnTabSmbFreq.
#               - undef if an error occurred (no alignment entries at all, ...)
#
# DESCRIPTION
# - The real work of determining the consensus sequence is done in
#   &AlnTabSmbFreq
#
sub AlnConsens {
  my ($pAln, %opt) = @_;
  my ($debug, $KeyId);
  my ($pSmbFreq, $pSmb, $AlnCons);
  my ($CtI);

  # function parameters
  $debug = $opt{-debug};
  $KeyId = $opt{-KeyId} || $_LibParam{KeyId};
  $debug and printf STDERR "%s. alignment IDs: %s%s\n", &MySub,
    join (' ', grep { defined $_ } map { $_->{$KeyId} } @{$pAln}[0..4]),
    (@$pAln > 5) ? ' ...':'';

  # get symbol frequency plot for alignment
  ($pSmbFreq, $pSmb) = &AlnTabSmbFreq ($pAln, %opt);
  unless (int @$pSmbFreq) { return undef }
  if ($debug) {
    printf STDERR "%s. symbol frequency plot as prepared by \&AlnTabSmbFreq\n", &MySub;
    printf STDERR '%6s' . '%4s' x (@$pSmb+0)  . '%9s'. '%6s' . '%9s' . '%8s' . "\n",
      'pos', @$pSmb, 'LettFreq', 'cover', 'ConsLett', 'consens';
    print  STDERR '-' x (6 + 4 * @$pSmb + 9 + 6 + 9 + 8), "\n";
    for ($CtI=0; $CtI<@$pSmbFreq; $CtI++) {
      printf STDERR '%6s' . '%4s' x (@$pSmb+0) . '%9s' . '%6s' . '%9s' . '%8s' . "\n",
        $CtI+1, @{$$pSmbFreq[$CtI]}{@$pSmb},
        $$pSmbFreq[$CtI]{LettFreq}, $$pSmbFreq[$CtI]{cover},
        $$pSmbFreq[$CtI]{ConsLett}, $$pSmbFreq[$CtI]{consens};
    }
  }

  # join most frequent letters to consensus sequence string
  $AlnCons = join ('', map { $_->{consens} } @$pSmbFreq);

  # exit SUB
  return wantarray ? ($AlnCons, $pSmbFreq, $pSmb) : $AlnCons;
}


################################################################################
# analysis resulting in complex data structure
################################################################################


# table of cross-wise distances between sequences in alignment
#
# INTERFACE
# - argument 1:  reference to alignment data structure
#                referenced data is left unchanged
#
# - options:
#   -debug       [STD]
#   -KeyId       [STD]
#   -KeySeq      [STD]
#   -MaskGapEnd  [STD]. *** implement me ***
#   -MaskGapIns  [STD]
#   -ResolveUnk  resolve 'unknown' letters N/X to the most frequent letter
#                of the alignment column (done in &AlnResolveUnk).
#   -SmbUnk      [STD]
#
# - return val:  - reference to distance table
#                - undef if an error occurred
#
# DESCRIPTION
# - upper and lower letters are regarded in the same way, lower case
#   changed to upper case (in the working copy of the alignment).
# - NOTE: distance calculation makes a difference depending on if you
#   count gaps as emissions or not.
#   If gaps are not masked, gap emissions shared by a pair of sequences
#   won't be counted as an emission position. The explanation is:
#   The distance observed with a pair of sequences must have a defined
#   value independent of any feature outside the pair of sequences. A gap,
#   that's introduced in both of a pair of sequences is not a feature
#   of the sequences themselves, but an insertion feature of any other
#   sequence in the alignment.
#
# DEBUG, CHANGES, ADDITONS
# - what's the basic philosophy of masking the gaps? Once again, think about it!
#
sub AlnTabDist {
  my ($pAln, %opt) = @_;
  my ($debug, $dbg2, $RegexpSmbNone);
  my ($AlnSeqType, $AlnLen);
  my ($pLabel, %TreeDist, %TreeDistAdv);
  my (@smb, $CtI, $CtJ, $CtPos, $CtEmit, $CtDiff);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # working copy of alignment
  # - optionally resolve unknown symbol
  # - mask end gaps and inner gaps
  # - determine effective unknown symbol
  $pAln = &AlnClone ($pAln, -KeyStd=>1, -KeyId=>$opt{-KeyId}, -KeySeq=>$opt{-KeySeq}, -Low2Up=>1);
    # after call of &AlnClone(...,-KeyStd=>1) we don't have to care about -KeyId and -KeySeq anymore
  unless (int @$pAln) { return undef }
  $opt{-ResolveUnk} and &AlnResolveUnk ($pAln, -debug=>$dbg2);
  &_AlnMaskgapEnd ($pAln, -debug=>$dbg2);
  $opt{-MaskGapIns} and &_AlnMaskgapInside ($pAln, $opt{-MaskGapIns}, -debug=>$dbg2);
  $AlnSeqType = &SeqType ($$pAln[0]{sequence});
  $RegexpSmbNone = '['. ($opt{-SmbUnk} || $SeqSmbUnk{$AlnSeqType}) . $SeqSmbMask .']';

  # start-up resources
  $AlnLen = &Max (map { length $_->{sequence} } @$pAln);
  $debug and printf STDERR "%s. alignment:\n", &MySub;
  $debug and printf STDERR "  size: %d x %d\n", $AlnLen, int @$pAln;
  $debug and printf STDERR "  seq type: %s\n", $AlnSeqType;

  # loop over alignment entries - first pairwise partner
  for ($CtI=0; ($CtI+1)<@$pAln; $CtI++) {
    push @$pLabel, $$pAln[$CtI]{id};

    # loop over alignment entries - 2nd pairwise partner
    for ($CtJ=$CtI+1; $CtJ<@$pAln; $CtJ++) {
      $debug and printf STDERR "%s. comparing %s / %s\n", &MySub,
        $$pAln[$CtI]{id}, $$pAln[$CtJ]{id};

      # loop over alignment positions
      # computational counting philosophy
      $CtEmit = $CtDiff = 0;
      for ($CtPos=0; $CtPos<$AlnLen; $CtPos++) {
        @smb = (substr ($$pAln[$CtI]{sequence}, $CtPos, 1), substr ($$pAln[$CtJ]{sequence}, $CtPos, 1));

        # do we have an emission in both sequences?
        unless ($smb[0] and $smb[1]) {
          $debug and printf STDERR "%s. end of sequence for at least one of entries %s / %s, pos %d\n", &MySub,
            $$pAln[$CtI]{id}, $$pAln[$CtJ]{id}, $CtPos;
          last;
        }
        if (join ('', @smb) =~ m/$RegexpSmbNone/) {
          $debug and printf STDERR "%s. no emission ('%s') for at least one of entries %s / %s, pos %d\n", &MySub,
            $&, $$pAln[$CtI]{id}, $$pAln[$CtJ]{id}, $CtPos;
          next;
        }
        if ($smb[0] eq '-' and $smb[1] eq '-') {
          $debug and printf STDERR "%s. both entries %s / %s gapped, pos %d\n", &MySub,
            $$pAln[$CtI]{id}, $$pAln[$CtJ]{id}, $CtPos;
          next;
        }
        $CtEmit ++;

        # compare sequence emission at particular position to consensus emission
        # - deviation? => upcount differences
        if ($smb[0] ne $smb[1]) {
          $CtDiff ++;
        }
      }

      # calculate distance measure
      if ($debug) {
        printf STDERR "  shared emission positions: %d\n", $CtEmit;
        printf STDERR "  abs. distance: %d\n", $CtDiff;
      }
      $TreeDist{$$pAln[$CtI]{id}}{$$pAln[$CtJ]{id}} = $CtEmit ? ($CtDiff / $CtEmit) : '-';
      if (wantarray) {
        $TreeDistAdv{$$pAln[$CtI]{id}}{$$pAln[$CtJ]{id}} = {
          dist    => $CtEmit ? ($CtDiff / $CtEmit) : '-',
          Nshared => $CtEmit,
          Ndiff   => $CtDiff,
          };
      }
    }
  }
  push @$pLabel, $$pAln[$CtI]{id};

  # exit SUB
  $debug and printf STDERR "%s. pairwise distance table:\n%s", &MySub;
  $debug and &DataPrint(\%TreeDist,-handle=>\*STDERR,-space=>2);
  return wantarray ? (\%TreeDist, $pLabel, \%TreeDistAdv) : \%TreeDist;
}


1;
# $Id: Align.pm,v 1.12 2007/12/30 12:12:31 szafrans Exp $
