################################################################################
#
#  kPerl Sequence Laboratory
#  Library for Sequence Manipulation
#
#  copyright (c)
#    Karol Szafranski, 2006
#    Fritz Lipmann Institute Jena, Genome Analysis Group, 2006
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2002
#    Karol Szafranski and Inst. Physiol. Chem., Univ. Dsseldorf, 1997-1998
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - for an understanding of the code here it is important to now about
#   sequence data structure definition as described in SeqLab::SeqFormat.pm.
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %_LibParam
#
# - basics
#   %SeqSmb
#   %SeqSmbUnk
#   $SeqSmbMask
#   %SyntaxTranslNtdrc
#   %SyntaxTranslWobble
#
# - sequence string formatting, concatenation, fragmentation
#   &_SeqPosIndex
#   &_SeqPosRebuild
#   &SeqStrRevcompl
#   &SeqRevcompl
#   &SeqStrPure
#   &SeqPure
#   &SeqRangeGapped
#   &SeqConcat
#   &SeqCutArray
#   &SeqRange
#   &SeqClipUnk
#   &SeqCplxRange
#
# - sequence masking and annotation
#   $_LibParam{ThreshPoly}
#   &MaskPoly
#
# - sequence type & sequence symbols
#   %SeqTypeBasic
#   &SeqType
#   &SeqStrDna2Rna
#   %SeqSyntaxAa
#   &SeqStrAaLong
#   %_SeqSmbFreqTab
#   &SeqSmbFreq
#
# - nucleotide to protein translation & re-translation
#   %_TranslTab
#   &_TranslTabAccess
#   &TranslNt
#   &TranslProt
#   &TranslOrfMax
#   &TranslStartArray
#   &TranslStopArray
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#   -timer      print time-performance protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::SeqBench;

# includes
use strict; #use warnings;  # OK 20071128
use MainLib::Path qw(%CorePath);
use MainLib::Data;
use MainLib::Misc qw(&MySub);
use Math::kCalc qw(&Min &Max);
use Math::Range;

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  %SeqSmb %SeqSmbUnk $SeqSmbMask %SyntaxTranslNtdrc %SyntaxTranslWobble
  &SeqStrRevcompl &SeqRevcompl
    &SeqStrPure &SeqPure &SeqRangeGapped
    &SeqConcat &SeqCutArray &SeqRange &SeqClipUnk &SeqCplxRange
  &MaskPoly
  %SeqTypeBasic &SeqType &SeqStrDna2Rna &SeqStrAaLong &SeqSmbFreq
  &TranslNt &TranslProt &TranslOrfMax &TranslStartArray &TranslStopArray
  );

# package-wide constants and variables
my %_LibParam;


################################################################################
# basics
################################################################################


# sequence symbols
#
our %SeqSmb = (
  'all' => 'ABCDEFGHIKLMNPQRSTUVWXY',
  'DNA' => 'ACGT',
  'DNA-unk' => 'ACGTN',
  'RNA' => 'ACGU',
  'nt' => 'ACGTU',
  'nt-unk' => 'ACGTUN',
  'nt-wobble' => 'ACGTUBDHKMNRSVWY',
  'prot' => 'ACDEFGHIKLMNPQRSTUVWY',
  'prot-unk' => 'ABCDEFGHIKLMNPQRSTUVWYXZ',
  );


# standard syntax for unknowns
#
our %SeqSmbUnk = (
  DNA        => 'N',
  RNA        => 'N',
  nucleotide => 'N',
  protein    => 'X',
  );
our $SeqSmbMask = 'Z';


# nucleotide direction encoding
#
our (%SyntaxTranslNtdrc);
$SyntaxTranslNtdrc{'End-PrimeNum'} = [
  [ '-1', '5' ],
  [  '1', '3' ],
  [ '+1', '3' ],
  ];
$SyntaxTranslNtdrc{End2PrimeNum} = {
  map { ($_->[0] => $_->[1]); } @{$SyntaxTranslNtdrc{'End-PrimeNum'}}
  };
$SyntaxTranslNtdrc{End2Prime} = {
  map { ($_->[0] => "$_->[1]'"); } @{$SyntaxTranslNtdrc{'End-PrimeNum'}}
  };


# IUPAC syntax for wobble bases
#
# DESCRIPTION
# - keep bases sorted in symbol arrays or concatenated strings!
#
our (%SyntaxTranslWobble);
$SyntaxTranslWobble{'Iupac-Bases'} = [
  [ 'A', 'A' ],
  [ 'C', 'C' ],
  [ 'G', 'G' ],
  [ 'T', 'T' ],
  [ 'B', 'CGT' ],
  [ 'D', 'AGT' ],
  [ 'H', 'ACT' ],
  [ 'K', 'GT' ],
  [ 'M', 'AC' ],
  [ 'N', 'ACGT' ],
  [ 'R', 'AG' ],
  [ 'S', 'CG' ],
  [ 'V', 'ACG' ],
  [ 'W', 'AT' ],
  [ 'Y', 'CT' ],
  ];
$SyntaxTranslWobble{Iupac2Bases} = {
  map { ($_->[0] => [ split(//,$_->[1]) ]) }
    @{$SyntaxTranslWobble{'Iupac-Bases'}}
  };
$SyntaxTranslWobble{Bases2Iupac} = {
  map { ($_->[1] => $_->[0]) }
    @{$SyntaxTranslWobble{'Iupac-Bases'}}
  };


################################################################################
# sequence string formatting, concatenation, fragmentation
################################################################################


# make indices to positional data in sequence data structure
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is possibly changed
#               see &_SeqPosRebuild
#
# - options:
#   -debug      [STD]
#
# - return val: array of references to positional values inside
#               sequence data structure
#
sub _SeqPosIndex {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};

  # sample positional data from annotations
  my @index = map { (
    (exists $_->{offset}) ? \$_->{offset} : (),
    (exists $_->{end}) ? \$_->{end} : (),
  ); } @{$$pSeq{annot}};
  my @RangeSplit = map {
    $_->{range} = [ split (/(\d+)/, $_->{range}) ];
    $_->{range};
  } grep { $_->{range} } @{$$pSeq{annot}};
  foreach my $pRange (@RangeSplit) {
    printf STDERR "%s. splitted range: %s\n", &MySub, join ('|', @$pRange);
    for (my $CtI=0; $CtI<@$pRange; $CtI++) {
      $$pRange[$CtI] =~ /^\d+$/ and push @index, \$$pRange[$CtI];
    }
  }

  # sample extra positional data
  # LE wasn't read into the data structure
  foreach (grep { $$pSeq{$_} } qw(SL SR QL QR)) {
    push @index, \$$pSeq{$_};
  }

  # exit SUB
  $debug and printf STDERR "%s. got index values: %s\n", &MySub,
    join (' ', map { $$_ } @index) || '()';
  return @index;
}


# rebuild annotation range entries after split by &_SeqPosIndex
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is explicitly changed
#
sub _SeqPosRebuild {
  my ($pSeq) = @_;
  my ($pAnnot);

  # sample range entries in annotations
  foreach $pAnnot (grep { $_->{range} } @{$$pSeq{annot}}) {
    if (ref($$pAnnot{range}) eq 'ARRAY') {
      $$pAnnot{range} = join ('', @{$$pAnnot{range}});
    }
  }
}


# return reverse-complement instance of sequence
#
# INTERFACE
# - argument 1: source sequence string
# - return val: formatted sequence
#
# DESCRIPTION
# - parentheses syntax (() and []) is corrected
# - degenerate coding is supported (complemented)
# - RNA coding is supported (U complemented to A)
#
sub SeqStrRevcompl {
  my ($sSeq) = @_;

  # reverse sequence
  $sSeq = reverse $sSeq;

  # do standard complementation
  # including RNA -> DNA
  $sSeq =~ tr/abcdghkmnrstuvwyABCDGHKMNRSTUVWY/tvghcdmknysaabwrTVGHCDMKNYSAABWR/;

  # correct parentheses syntax
  $sSeq =~ tr#\]\[\)\(#\[\]\(\)#;

  return $sSeq;
}


# return reverse-complement instance of sequence data structure
#
# INTERFACE
# - argument 1: source sequence entry
#
# - options:
#   -debug      [STD]
#   -TrackPos   prepare array which map resulting sequence string positions
#               to the former (original) sequence string positions. Array
#               position 0 refers to resulting sequence string position 1
#               and so on.
#
# - return val: sequence entry
#
# DESCRIPTION
# - parentheses syntax (sequence may be a motif with () and []) is corrected
# - degenerate coding is supported (complemented)
# - RNA coding is supported (U complemented to A)
# - annotations outside the sequence range are deleted. In theory, this
#   module allows annotation-only sequence entries.
#
# DEBUG, CHANGES, ADDITIONS
# - complex range annotations cannot be handled and will be deleted.
#
sub SeqRevcompl {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};

  # copy & nice seq data structure
  $pSeq = &DataClone ($pSeq);  # BEWARE: annotation data may be cross-linked in future code versions
  delete $$pSeq{SeqPure};

  # reverse-complement sequence string
  # - track seq positions during procedure?
  if ($opt{-TrackPos}) {
    $$pSeq{PosArr} ||= [1 .. (length $$pSeq{sequence})];
  } else {
    delete $$pSeq{PosArr};
  }
  $$pSeq{sequence} = &SeqStrRevcompl ($$pSeq{sequence});
  if ($opt{-TrackPos}) {
    @{$$pSeq{PosArr}} = reverse @{$$pSeq{PosArr}};
  }

  # apply reverse to positional fields (annotations) of sequence data
  my $iSeqLen = length $$pSeq{sequence};
  # have to delete complex range annotations
  @{$$pSeq{annot}} = grep { ! exists $_->{range} or ! $_->{range} } @{$$pSeq{annot}};
  my @AnnotPosIdx = &_SeqPosIndex ($pSeq, -debug=>$debug);
  foreach (@AnnotPosIdx) {
    if ($$_<1 or $$_>$iSeqLen) {
      $$_ = undef;
    } else {
      $$_ = $iSeqLen - $$_ + 1;
    }
  }

  # reverse order of offset/end fields in annotations
  foreach my $ItTagType ('S', 'Q') {
    ($$pSeq{$ItTagType.'L'}, $$pSeq{$ItTagType.'R'}) =
      ($$pSeq{$ItTagType.'R'}, $$pSeq{$ItTagType.'L'});
    $$pSeq{$ItTagType.'L'} or delete $$pSeq{$ItTagType.'L'};
    $$pSeq{$ItTagType.'R'} or delete $$pSeq{$ItTagType.'R'};
  }
  for (my $i=0; $i<int(@{$$pSeq{annot}}); ++$i) {
    my $pAnnot = $$pSeq{annot}[$i];
    ($pAnnot->{offset}, $pAnnot->{end}) = ($pAnnot->{end}, $pAnnot->{offset});
    unless ($pAnnot->{offset} and $pAnnot->{end}) {
      splice @{$$pSeq{annot}}, $i, 1;
      -- $i;
      next;
    }
    $pAnnot->{orient} *= -1;
  }

  # re-sort annotation list
  @{$$pSeq{annot}} = sort {
    $$a{offset} <=> $$b{offset} or
       $$b{end} <=> $$a{end} or
               $a <=> $b;
    } @{$$pSeq{annot}};

  # modify sequence description entry
  $$pSeq{descr} =~ s/^\s*$//;
  $$pSeq{descr} .= ($$pSeq{descr} ? ', ':'') . 'reverse-complement';
  delete $$pSeq{header};

  # return successfully
  return $pSeq;
}


# purify sequence strings for sequence-coding letters
#
# INTERFACE
# - argument 1: raw sequence string
#
# - options:
#   -lower      force sequence string to lower case letters
#   -SeqType    specifies the sequence type (default: none). This will lead
#               to an appropriate coding alphabet for conversion of fuzzy chars.
#               DNA      do U->T translation
#               DNA5     like DNA, but do not allow wobble codes (IUPAC)
#               protein  ...
#               RNA      do T->U translation
#               RNA5     like RNA, but do not allow wobble codes (IUPAC)
#   -upper      force sequence string to upper case letters
#
# - return val: - wantscalar: pure sequence string
#               - wantarray: pure sequence string, number of non-
#                 sequence-coding characters that have been removed.
#
# DESCRIPTION
# - non-standard sequence-coding characters are deleted
# - fuzzy letters are converted to official 'unknowns'.
#   Set option -SeqType.
#
sub SeqStrPure {
  my ($sSeq,%opt) = @_;
  my ($CtDel);

  # lower/upper case
  # tr// works faster than an 'uc'
  $opt{-lower} and $sSeq =~ tr/A-Z/a-z/;
  $opt{-upper} and $sSeq =~ tr/a-z/A-Z/;

  # leave all standard sequence coding letters
  $CtDel = ($sSeq =~ tr/abcdefghiklmnpqrstuvwxyABCDEFGHIKLMNPQRSTUVWXY//cd);
    # coordinate this with definition as: lc ($SeqSmb{all}) . uc ($SeqSmb{all})
    # elsewhere in this code

  # change fuzzy letters according to specified sequence type
  if ($opt{-SeqType}) {
    if (0) {
    } elsif ($opt{-SeqType} eq 'DNA') {
      $sSeq =~ tr/uU/tT/;
      $sSeq =~ tr/a-zABCDGHKMNRSTVWY/N/c;
      $sSeq =~ tr/abcdghkmnrstvwyA-Z/n/c;
    } elsif ($opt{-SeqType} eq 'DNA5') {
      $sSeq =~ tr/uU/tT/;
      $sSeq =~ tr/a-zACGNT/N/c;
      $sSeq =~ tr/acgntA-Z/n/c;
    } elsif ($opt{-SeqType} eq 'protein') {
      $sSeq =~ tr/a-zACDEFGHIKLMNPQRSTVWXY/X/c;
      $sSeq =~ tr/acdefghiklmnpqrstvwxyA-Z/x/c;
    } elsif ($opt{-SeqType} eq 'RNA') {
      $sSeq =~ tr/tT/uU/;
      $sSeq =~ tr/a-zABCDGHKMNRSUVWY/N/c;
      $sSeq =~ tr/abcdghkmnrsuvwyA-Z/n/c;
    } elsif ($opt{-SeqType} eq 'RNA5') {
      $sSeq =~ tr/tT/uU/;
      $sSeq =~ tr/a-zACGNU/N/c;
      $sSeq =~ tr/acgnuA-Z/n/c;
    }
  }

  # return
  return wantarray ? ($sSeq,$CtDel) : $sSeq;
}


# purify sequence string in sequence data structure accounting for annotations
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               sequence entry is left unchanged
#
# - options:
#   -debug      [STD]
#   -SeqType    specifies the sequence type (default: none). This will lead
#               to an appropriate coding alphabet for conversion of fuzzy chars.
#               DNA      do U->T translation
#               DNA5     like DNA, but do not allow wobble codes (IUPAC)
#               protein  ...
#               RNA      do T->U translation
#               RNA5     like RNA, but do not allow wobble codes (IUPAC)
#
# - return val: - wantscalar: reference to sequence entry
#               - wantarray: reference to sequence entry, number of non-
#                 sequence-coding characters that have been removed.
#               - undef if an error occurred
#
# DESCRIPTION
# - see &_SeqPosIndex for the fields that are updated correctly.
# - upper/lower case appearance remains unchanged
#
# DEBUG, CHANGES, ADDITIONS
# - currently, some fields are not adjusted to the purified sequence:
#   AV, ON. Because they loose their informative value, they are just
#   deleted.
# - options:
#   -TrackPos   prepare array which map resulting sequence string positions
#               to the former (original) sequence string positions. Array
#               position 0 refers to resulting sequence string position 1
#               and so on.
#
sub SeqPure {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my $CharCode = lc($SeqSmb{all}) . uc($SeqSmb{all});

  # anything to do here?
  $pSeq = &DataClone ($pSeq);
  delete $$pSeq{SeqPure};
  $debug and printf STDERR "%s. purifying sequence %s, alphabet %s\n", &MySub,
    $$pSeq{id}||"''", $opt{-SeqType}||'NONE';

  # have to remove any gaps here?
  if ($$pSeq{sequence} =~ m/[^$CharCode]+/o) {

    # position-referring fields that won't contain proper information after this
    #   procedure
    foreach (qw(AV ON)) { delete $$pSeq{$_} }

    # get array of references to positional values
    my @PosIndex = &_SeqPosIndex ($pSeq, -debug=>$debug);

    # scan for gaps
    while ($$pSeq{sequence} =~ m/[^$CharCode]+/o) {
      my $pos = length $`;
      my $len = length $&;

      # check for positional change
      # positions are shifted either:
      # - if following the gap: leftwards for the size of the gap
      # - if inside the gap: leftwards to the next position following the gap
      foreach (grep { $$_ > $pos } @PosIndex) {
        $$_ -= &Min ($len, $$_ - $pos);
      }

      # delete gap in sequence string
      $$pSeq{sequence} = $` . $';
    }
  }

  # apply alphabet
  $$pSeq{sequence} = &SeqStrPure ($$pSeq{sequence}, -SeqType=>$opt{-SeqType});

  # rebuild positions and debug
  &_SeqPosRebuild ($pSeq, -debug=>$debug);
  if ($debug) {
    &_SeqPosIndex ($pSeq, -debug=>1);
    &_SeqPosRebuild ($pSeq, -debug=>0);
  }

  # exit SUB
  return wantarray ? ($pSeq,0) : $pSeq;
}


# map range data from gap-free sequence string to gapped sequence string
#
# INTERFACE
# - argument 1: sequence string purified
# - argument 2: sequence string gapped
# - argument 3: reference to range object (or reference to data structure
#               which may be used to create a range object) referring to the
#               ungapped sequence.
#
# - options:
#   -debug      [STD]
#
# - return val: - reference to range object which is equivalent to
#                 hash ref { '-1'=>$PosLeft, '1'=>$PosRight }
#               - undef if an error occurred
#
# DESCRIPTION
# - the function selects the minimal range on the gapped sequence
#   i.e. the range won't contain gaps adjacent to the borders.
#
sub SeqRangeGapped {
  my ($sSeqPure, $sSeqOrig, $pRange, %opt) = @_;
  my $debug = $opt{-debug};
  my $CharCode = lc($SeqSmb{all}) . uc($SeqSmb{all});
  $pRange = Math::Range->new($pRange) or return undef;

  # anything to do here?
  if ($sSeqOrig !~ m/[^$CharCode]+/o) {
    return $pRange;
  }

  # scan for gaps
  my $CtGap=0;
  while (
    $sSeqOrig =~ m/[^$CharCode]/og and
    pos($sSeqOrig)<=($pRange->lower()+$CtGap)
  ) {
    ++ $CtGap;
  }
  $pRange->lower ($pRange->lower()+$CtGap);
  pos($sSeqOrig) = $pRange->lower();
  while (
    $sSeqOrig =~ m/[^$CharCode]/og and
    pos($sSeqOrig)<=($pRange->upper()+$CtGap)
  ) {
    ++ $CtGap;
  }
  $pRange->upper ($pRange->upper()+$CtGap);

  # exit SUB
  return $pRange;
}


# join two sequence data structures
#
# INTERFACE
# - argument 1: left sequence entry
# - argument 2: rigth sequence entry
#
# - options:
#   -debug      [STD]
#   -spacer     insert a spacer of Ns between the joint sequences, default:
#               do not
#
# - return val: joint sequence entry
#
# DESCRIPTION
# - source sequence entries are left unchanged.
#
sub SeqConcat {
  my ($pSeqLeft,$pSeqRight,%opt) = @_;
  my $debug = $opt{-debug};
  my $iSeqLen = length ($$pSeqLeft{sequence}) + $opt{-spacer};
  my $pSeq = &DataClone ($pSeqLeft);
  delete $$pSeq{SeqPure};
  my $pSeqAdd = &DataClone ($pSeqRight);

  # join sequence strings
  $$pSeq{sequence} .= ($SeqSmbUnk{&SeqType($$pSeq{sequence})} x $opt{-spacer})
    . $$pSeqAdd{sequence};

  # delete some positional fields (annotations) of both sequence entries
  foreach my $ItTagType (qw(SL SR QL QR)) {
    delete $$pSeq{$ItTagType};
    delete $$pSeqAdd{$ItTagType};
  }
  # shift positional fields (annotations) of right sequence entry data
  my @PosIndex = &_SeqPosIndex ($pSeqAdd, -debug=>$debug);
  foreach (@PosIndex) { $$_+=$iSeqLen }
  &_SeqPosRebuild ($pSeqAdd, -debug=>$debug);
  if ($debug) {
    &_SeqPosIndex ($pSeqAdd, -debug=>1);
    &_SeqPosRebuild ($pSeqAdd, -debug=>0);
  }
  push @{$$pSeq{annot}}, @{$$pSeqAdd{annot}};

  # modify sequence description entry
  delete $$pSeq{descr};
  delete $$pSeq{header};

  # return successfully
  return $pSeq;
}


# cut at array of positions
#
# INTERFACE
# - argument 1: reference to source sequence entry
#               sequence entry is left unchanged
# - argument 2: reference to array of cut positions. The array is left
#               unchanged.
#               - cut positions refer to the left position of the next fragment
#                 (biological counting).
#               - option -olap interferes with the resulting fragment length!
#
# - options:
#   -debug      [STD]
#   -olap       size of fragment overlap (default: none)
#               cut positions refer to the leftmost position of the overlap
#               region.
#   -PosPure    range positions refer to the pure sequence letter string,
#               default: refer to positions in (possibly) gapped sequence
#               *** not implemented ***
#   -simple     cut quick and easy into simple sequence fragment data
#               structures that consist of id+descr+seq. This sparse some
#               computation time if the destination file format simple, i.e.
#               one of fastA, table etc., and the source format is complex, i.e.
#               one of GenBank, Experiment etc. However, if the input sequences
#               are already simply structured, then this function works quite
#               economically.
#
# - return val: - reference to sequence array of fragments
#               - undef if an error occurred
#
# DESCRIPTION
# - This function works quite economically, even for very large input.
#   If the input sequence data is rich in annotations and other accessory
#   information then use option -simple.
#
# DEVELOPER'S NOTES
# - sequence positions have base-oriented syntax, here.
#
sub SeqCutArray {
  my ($pSeq, $pCutPos, %opt) = @_;
  my ($debug, $iSeqLen, $LenOlap, $RangeOff, $RangeEnd);
  my ($pSeqNew, @SeqFrag);

  # function parameters
  $debug = $opt{-debug};
  $LenOlap = int $opt{-olap};
  $iSeqLen  = length $$pSeq{sequence};
  if ($debug) {
    printf STDERR "%s. parameters:\n", &MySub;
    printf STDERR "  sequence: %s\n", $$pSeq{id}||"''";
    printf STDERR "  array of positions: %s\n", join (', ', map { int($_) } @$pCutPos);
    printf STDERR "  fragment overlap size: %d\n", $LenOlap;
  }

  # loop over resulting ranges
  # positions in @$pCutPos are positions of offset
  $RangeOff = 1;
  foreach my $pos (@$pCutPos, $iSeqLen+1) {

    # complete current-step range
    $RangeEnd = &Min ($pos+$LenOlap-1, $iSeqLen);

    # apply range to sequence
    $pSeqNew = $opt{-simple} ? {
      id       => $$pSeq{id},
      descr    => $$pSeq{descr},
      sequence => substr($$pSeq{sequence},$RangeOff-1,$RangeEnd-$RangeOff+1),
      } : &SeqRange ($pSeq, $RangeOff, $RangeEnd, %opt);
    $$pSeqNew{id} .= sprintf ('.%d', int(@SeqFrag));
    push @SeqFrag, $pSeqNew;
    $debug and printf STDERR "%s. generated sequence fragment, ID %s\n", &MySub, $$pSeqNew{id};

    # prepare next step
    $RangeOff = $RangeEnd - $LenOlap + 1;
  }

  # return successfully
  return \@SeqFrag;
}


# return sequence sub-range
#
# INTERFACE
# - argument 1: reference to source sequence entry
#               sequence entry is left unchanged
# - argument 2: sequence range offset, default: 1
# - argument 3: sequence range end, default: sequence end position
#
# - options:
#   -debug      [STD]
#   -isPure     sequence string is provided in pure state
#   -PurePos    range positions refer to the pure sequence letter string,
#               default: refer to positions in (possibly) gapped sequence
#   -TrackPos   prepare array which map resulting sequence string positions
#               to the former (original) sequence string positions. Array
#               position 0 refers to resulting sequence string position 1
#               and so on.
#
# - return val: - reference to sequence entry (safe copy)
#               - undef if an error occurred
#
# DESCRIPTION
# - position numbering is treated in biological notation.
#   A negative position value will be evaluated in relation to the
#   sequence end rather than the sequence offset (for positive values).
#   1   refers to 1st position of sequence string
#   -1  refers to last position of sequence string
# - for gapped sequences in -PosPure mode both offset and end positions
#   will evaluate to sequence coding positions, not gaps.
# - in description entry of the sequence data structure a notion
#   on the range procedure will be appended.
#
# DEVELOPER'S NOTES
# - a function analogon on alignment level is &SeqAlign::AlnProj::AlnprojSplit
#
# DEBUG, CHANGES, ADDITIONS
# - apply range to annotation fields with complex range data field
#
sub SeqRange {
  my ($pSeq,$iRgSt,$iRgEnd,%opt) = @_;
  my $debug = $opt{-debug};
  my $CharCode = lc($SeqSmb{all}) . uc($SeqSmb{all});

  # default to range arguments
  my ($iSeqLen);
  if ($opt{-PurePos}) {
    if ($opt{-isPure}) {
      $$pSeq{SeqPure} ||= $$pSeq{sequence};
    }
    $iSeqLen = length ($$pSeq{SeqPure}||=&SeqStrPure($$pSeq{sequence},-upper=>1));
  } else {
    $iSeqLen = length ($$pSeq{sequence});
  }

  # check for validity of range parameters
  # we don't use the Math::Range object cause it would produce errors with negative
  #   position parameters (which have a special meaning here)
  $iRgSt ||= 1;
  $iRgEnd ||= $iSeqLen;
  if ($iRgSt<0) { $iRgSt += $iSeqLen + 1 }
  if ($iRgEnd<0) { $iRgEnd += $iSeqLen + 1 }
  if ($iRgSt>$iRgEnd or $iRgSt<1 or $iRgEnd>$iSeqLen) {
    printf STDERR "%s. ERROR: undefined cut range for sequence %s\n", &MySub, $$pSeq{id}||"''";
    printf STDERR "  range start..end: %d..%d, seq length: %d\n", $iRgSt, $iRgEnd, $iSeqLen;
    return undef;
  }
  if ($iRgSt==1 and $iRgEnd==$iSeqLen) { return $pSeq }

  # translate range parameters in -PurePos mode
  if ($opt{-PurePos}) {
    my $i = $iRgSt - 1;
    my $j = $iRgEnd - $iRgSt + 1 - 1;
    unless ($$pSeq{sequence} =~ m/^(([^$CharCode]*([$CharCode][^$CharCode]*){$i})[$CharCode]([^$CharCode]*[$CharCode]){$j})/) {
      printf STDERR "%s. ERROR: regexp match failed in -PurePos mode, seq %s\n", &MySub, $$pSeq{id}||"''";
      printf STDERR "  range start..end: %d..%d, seq length orig./pure: %d/%d\n", $iRgSt, $iRgEnd, length ($$pSeq{sequence}), $iSeqLen;
      printf STDERR "  sequence pre %d, sequence pre+range %d\n", $i, $j+1;
      return undef;
    }
    $iRgSt = length($2) + 1;
    $iRgEnd = length($1);
  }

  ##############################################################################
  # start building sub-sequence

  # clone sequence data structure
  my $pSeqNew={};
  foreach my $key (grep { $_!~/^(annot|AnnotGrp|header|PosArr|QL|QR|SL|SR|sequence|SeqPure)$/ } grep { defined($$pSeq{$_}) } keys %$pSeq) {
    $$pSeqNew{$key} = ref($$pSeq{$key}) ?
      &DataClone($$pSeq{$key}) : $$pSeq{$key};
  }

  # string formatting
  # work around bulk memory usage
  $$pSeqNew{sequence} = substr ($$pSeq{sequence}, $iRgSt-1, $iRgEnd-$iRgSt+1);

  # track original sequence positions, or start tracking
  if ($opt{-TrackPos}) {
    @{$$pSeqNew{PosArr}} = (exists($$pSeq{PosArr}) and $$pSeq{PosArr}) ?
      splice (@{$$pSeq{PosArr}}, $iRgSt-1, $iRgEnd-$iRgSt+1) : ($iRgSt..$iRgEnd);
  }

  ##############################################################################
  # apply range to positional fields (annotations) of sequence data

  # adjust positions to new range borders
  # - annotations with complex range specificatons will be deleted
  my ($pAnnotNew);
  foreach my $pAnnot (@{$$pSeq{annot}}) {
    if ($$pAnnot{range} or $$pAnnot{end}<$iRgSt or $$pAnnot{offset}>$iRgEnd) {
      next;
    }
    $pAnnotNew = &DataClone($pAnnot);
    if ($$pAnnotNew{offset} < $iRgSt) { $$pAnnotNew{offset} = $iRgSt }
    if ($$pAnnotNew{end} > $iRgEnd) { $$pAnnotNew{end} = $iRgEnd }
    push @{$$pSeqNew{annot}}, $pAnnotNew;
  }

  # move positional values to the left
  if ($iRgSt > 1) {
    foreach (&_SeqPosIndex ($pSeqNew, -debug=>$debug)) {
      $$_ -= $iRgSt - 1;
    }
  }

  ##############################################################################
  # post-work

  # modify sequence description entry
  $$pSeqNew{descr} =~ s/^\s*$//;
  $$pSeqNew{descr} .= ($$pSeqNew{descr} ? ', ':'')
    . sprintf ("range %d..%d", $iRgSt, $iRgEnd);

  # return successfully
  return $pSeqNew;
}


# cut off unknown symbols at sequence ends
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data will be left unchanged
#
# - options:
#   -debug      [STD]
#   -SmbUnk     explicitly specify the symbol for unknown emissions.
#               Symbol will evaluated to both upper and lower case.
#
# - return val: reference to sequence data structure
#
# DESCRIPTION
# - this function will also remove gaps at the sequence ends
#
sub SeqClipUnk {
  my ($pSeq, %opt) = @_;
  my ($debug);
  my ($CharUnk, $CharCode, $RegexpUnk);
  my ($DelRight, $DelLeft, $iSeqLen);

  # function parameters
  $debug = $opt{-debug};

  ##############################################################################
  # determine effective known range

  # alphabet parameters
  $CharUnk = uc ($opt{-SmbUnk} || $SeqSmbUnk{&SeqType($$pSeq{sequence})});
  $CharCode = join ('', grep { $_ ne $CharUnk } split (//, uc ($SeqSmb{all})));
  $RegexpUnk = '[^'. lc ($CharCode) . uc ($CharCode) .']';
  $debug and printf STDERR "%s. clipping with regexp %s\n", &MySub, $RegexpUnk;

  # clip lengths
  $$pSeq{sequence} =~ m/^($RegexpUnk*)/;
  $DelLeft  = length $1;
  $$pSeq{sequence} =~ m/($RegexpUnk*)$/;
  $DelRight = length $1;
  $iSeqLen = length $$pSeq{sequence};
  $debug and printf STDERR "%s. clipping %d left, %d right, seq length %d\n", &MySub,
    $DelLeft, $DelRight, $iSeqLen;

  ##############################################################################
  # apply range to sequence entry

  # nothing to do
  if (!$DelLeft and !$DelRight) {
    return $pSeq;
  }

  # select true effective known range
  if ($DelLeft < $iSeqLen-$DelRight) {
    $pSeq = &SeqRange ($pSeq, $DelLeft+1, $iSeqLen-$DelRight, -debug=>$debug);
    $$pSeq{descr} =~ s/r(ange \d+\.\.\d+$)/KnownR$1/;
  }

  # NULL known range
  else {
    $debug and printf STDERR "%s. WARNING: NULL known range, seq %s\n", &MySub, $$pSeq{id}||"''";
    $pSeq = &DataClone ($pSeq);
    foreach (qw(sequence QL QR SL SR annot)) {
      delete $$pSeq{$_};
    }
    $$pSeq{descr} .= ($$pSeq{descr} ? ', ':'') . 'KnownRange NULL';
  }

  # exit SUB
  return $pSeq;
}


# return complex sequence sub-range
#
# INTERFACE
# - argument 1: reference to source sequence entry
#               sequence entry is left unchanged
# - argument 2: range command string
#               syntax as in GenBank feature table, extented for negative
#               position values (see &SeqRange)
#
# - options:
#   -debug      [STD]
#   -isPure     sequence string is provided in pure state
#               This switch takes effect in &SeqRange.
#   -PurePos    range positions refer to the pure sequence letter string,
#               default: refer to positions in (possibly) gapped sequence
#   -TrackPos   prepare array which map resulting sequence string positions
#               to the former (original) sequence string positions. Array
#               position 0 refers to resulting sequence string position 1
#               and so on.
#
# - return val: - reference to sequence entry
#               - undef if an error occurred
#
# DESCRIPTION
# - for details on position arguments see description of function &SeqRange
# - in description entry of the sequence data structure a notion
#   on the range procedure will be appended.
#
# DEBUG, CHANGES, ADDITIONS
# - apply range to annotation fields
#
sub SeqCplxRange {
  my ($pSeq, $RangeArg, %opt) = @_;
  my $debug = $opt{-debug};
  my $SeqDescr = $$pSeq{descr};
  $debug and printf STDERR "%s. sequence %s\n", &MySub, $$pSeq{id};

  # chain into appropriate atom action
  # do recursion to perform complex application of range argument
  if (0) { }

  # atom action complement()
  # this automatically prepares a safe copy on $pSeq
  elsif ($RangeArg =~ m/^complement\((.+)\)$/) {
    $debug and printf STDERR "%s. found atom action complement()\n", &MySub;
    unless ($pSeq = &SeqCplxRange($pSeq,$1,%opt)) { return undef }
    unless ($pSeq = &SeqRevcompl($pSeq,%opt)) { return undef }
  }

  # atom action join()
  # this automatically prepares a safe copy on $pSeq, finally (see end of block)
  elsif ($RangeArg =~ m/^join\((.+)\)$/) {
    $debug and printf STDERR "%s. found atom action join()\n", &MySub;
    my ($pSeqNext, $pSeqSub);
    foreach my $RangeNext (split (/,/, $1)) {
      unless ($pSeqSub = &SeqCplxRange($pSeq,$RangeNext,%opt)) { return undef }
      if ($pSeqNext) {
        $$pSeqNext{sequence} .= $$pSeqSub{sequence};
        if ($opt{-TrackPos}) {
          push @{$$pSeqNext{PosArr}}, @{$$pSeqSub{PosArr}};
        }
        push @{$$pSeqNext{annot}}, @{$$pSeqSub{annot}};
      } else {
        $pSeqNext = $pSeqSub;
      }
    }
    $pSeq = $pSeqNext;
  }

  # atom action x..y
  # this automatically prepares a safe copy on $pSeq
  elsif ($RangeArg =~ m/^[<>=]*(-?\d+)\.\.[<>=]*(-?\d+)$/) {
    # a negative position will be evaluated as located relative to seq
    # end (done in &SeqRange)
    $debug and printf STDERR "%s. found atom action %d..%d\n", &MySub, $1, $2;
    unless ($pSeq = &SeqRange($pSeq,$1,$2,%opt)) { return undef }
  }

  # unable to determine next atom action
  else {
    printf STDERR "%s. ERROR: unable to determine next atom action on range argument %s, sequence %s\n", &MySub,
      $RangeArg||"''", $$pSeq{id};
    return undef;  # <- important!
  }

  # modify sequence description entry
  # It's actually impossible to land here without having a subsequence transformation
  # taken place, cf. previous else{} block. So, it's safe here to modify %$pSeq.
  $SeqDescr =~ s/^\s*$//;
  $$pSeq{descr} = $SeqDescr . ($SeqDescr ? ', ':'') . "range $RangeArg";
  delete $$pSeq{header};

  # return successfully
  return $pSeq;
}


################################################################################
# sequence masking and annotation
################################################################################


# standard parameters for masking
$_LibParam{ThreshPoly} = 12;


# mask poly(X) stretches in a sequence
#
# INTERFACE
# - argument 1: source sequence string
#
# - options:
#   -debug      [STD]
#   -SmbMask    masking character, default: $SeqSmbMask
#   -ThreshPoly minimal length of poly(X) stretches
#   -timer      [STD]
#
# - return val: masked sequence string
#
sub MaskPoly {
  my ($sSeq,%opt) = @_;

  # function parameters
  my $debug = $opt{-debug};
  my $SmbMask = $opt{-SmbMask} || $SeqSmbMask;
  my $ThreshPoly = $opt{-ThreshPoly} || $_LibParam{ThreshPoly};
  $ThreshPoly --;  # applied as: number of extra characters of the same type
  my $bTimer = $opt{-timer}; my $time;

  # loop over instances of poly(X)
  $bTimer and $time = (times)[0];
  # i had segmentation faults with very long poly(N) stretches
  my $CtInst;
  while ($sSeq =~ m/([a-mo-wyzA-MO-WYZ])(-*\1){$ThreshPoly,}/g) {
    $CtInst ++;
    my $sSeqsubPre = $`;
    my $sSeqsubPoly = $&;
    my $sSeqsubPost = $';
    # the following is not really faster than:
    # $SeqPos = length($sSeqsubPre) + length($sSeqsubPoly)
    my $SeqPos = pos($sSeq);
    $debug and printf STDERR "%s. found poly(%s) stretch at position %d, length %d\n", &MySub,
      substr($sSeqsubPoly,0,1), length($sSeqsubPre), length($sSeqsubPoly);

    # replace string
    # - beware of a recursive call like:
    #   $sSeq = $sSeqsubPre . $sSeqsubPoly . &MaskPoly($sSeqsubPost,%opt);
    #   this needs enormous memory resources for long sequences
    $sSeqsubPoly =~ s/[a-zA-Z]/$SmbMask/g;
    $sSeq = $sSeqsubPre . $sSeqsubPoly . $sSeqsubPost;
    pos($sSeq) = $SeqPos;
  }

  # summary debug
  $debug and printf STDERR "%s. replaced a total of %d poly(X) stretches\n", &MySub, $CtInst;
  if ($CtInst and $bTimer) {
    $bTimer and printf STDERR "%s. CPU time - total %.3f, unit %.6f\n", &MySub,
      (times)[0]-$time, ((times)[0]-$time)/$CtInst;
  }

  # exit SUB successfully
  return $sSeq;
}


################################################################################
# sequence type & sequence symbols
################################################################################


# translate seq type to basic seq type
our %SeqTypeBasic = (
  protein    => 'protein',
  nucleotide => 'nucleotide',
  DNA        => 'nucleotide',
  RNA        => 'nucleotide',
  );

# determine type of given sequence
#
# INTERFACE
# - argument 1: sequence string
#
# - options:
#   -basic      boolean switch value. Only determine and return basic type:
#               nucleotide / protein
#   -debug      [STD]
#
# - return val: wantscalar ->
#               - sequence type: protein / nucleotide / DNA / RNA
#                 cmp. option -basic
#               wantarray -> array of:
#               - sequence type: protein / nucleotide / DNA / RNA
#                 cmp. option -basic
#               - degenerate flag (wobble nts or unknown aas)
#
sub SeqType {
  my ($sSeq,%opt) = @_;

  # function parameters
  my $debug = $opt{-debug};
  if (ref($sSeq)) {
    die sprintf "%s. code ERROR: string expected in arg1 but found ref %s\n", &MySub,
      ref($sSeq);
  }
  $debug and printf STDERR "%s. sequence string:\n%s\n", &MySub, $sSeq;

  # check for protein
  if ($sSeq =~ m/[EFILPQX]/i) {
    return wantarray ? ('protein',$sSeq=~m/X/i) : 'protein';
  }

  # basic type is enough?
  if ($opt{-basic}) {
    return wantarray ? ('nucleotide',$sSeq=~m/[BDHKMNRSVWY]/i) : 'nucleotide';
  }

  # check for RNA
  if ($sSeq =~ m/U/i) {
    return wantarray ? ('RNA',$sSeq=~m/[BDHKMNRSVWY]/i) : 'RNA';
  }
  # ... else it is DNA
  else {
    return wantarray ? ('DNA',$sSeq=~m/[BDHKMNRSVWY]/i) : 'DNA';
  }
}


# return RNA representation of nucleotide sequence
#
# INTERFACE
# - argument 1: nucleotide sequence string
# - return val: RNA sequence string
#
# DESCRIPTION
# - the only work that's done here is translation of T to U
# - degenerate coding won't be affected
#
sub SeqStrDna2Rna {
  my ($SeqStrRna) = @_;

  # convert symbols
  $SeqStrRna =~ tr/tT/uU/;

  # exit SUB
  return $SeqStrRna;
}


# return RNA representation of nucleotide sequence
our (%SeqSyntaxAa);
$SeqSyntaxAa{'short-long'} = [
  ['A', 'Ala'],
  ['C', 'Cys'],
  ['D', 'Asp'],
  ['E', 'Glu'],
  ['F', 'Phe'],
  ['G', 'Gly'],
  ['H', 'His'],
  ['I', 'Ile'],
  ['K', 'Lys'],
  ['L', 'Leu'],
  ['M', 'Met'],
  ['N', 'Asn'],
  ['P', 'Pro'],
  ['Q', 'Gln'],
  ['R', 'Arg'],
  ['S', 'Ser'],
  ['T', 'Thr'],
  ['V', 'Val'],
  ['W', 'Trp'],
  ['Y', 'Tyr'],
  ];
$SeqSyntaxAa{short2long} = {
  map { ($_->[0] => $_->[1]) } @{$SeqSyntaxAa{'short-long'}}
  };
$SeqSyntaxAa{long2short} = {
  map { ($_->[1] => $_->[0]) } @{$SeqSyntaxAa{'short-long'}}
  };

# return long syntax of protein sequence
#
# INTERFACE
# - argument 1: protein sequence string
# - return val: protein sequence string
#
sub SeqStrAaLong {
  my ($SeqStrAa) = @_;
  my ($SeqStrAaNew);

  # convert symbols
  foreach (split //, $SeqStrAa) {
    $SeqStrAaNew .= $SeqSyntaxAa{short2long}{$_};
  }
  $SeqStrAaNew =~ s/^-*//;

  # exit SUB
  return $SeqStrAaNew;
}


# sequence symbol frequency tables
$_LibParam{SrcSmbFreq} = {
  nucleotide => "$CorePath{call}{PerlScript}/SeqLab/SmbFreqNt.dat",
  protein    => "$CorePath{call}{PerlScript}/SeqLab/SmbFreqProt.dat",
  };

# sequence symbol frequency tables
our (%_SeqSmbFreqTab) = ();

# return sequence symbol frequency table
#
# INTERFACE
# - argument 1: sequence type
# - argument 2: sample descriptor (string)
# - return val: - reference to sequence symbol frequency table
#               - undef if an error occurs
#
sub SeqSmbFreq {
  my ($SeqType, $SeqSample) = @_;
  my ($pTab);

  # load table
  unless (exists $_SeqSmbFreqTab{$SeqType} and %{$_SeqSmbFreqTab{$SeqType}}) {
    unless ($_LibParam{SrcSmbFreq}{$SeqType}) {
      printf STDERR "%s. ERROR no symbol frequency table available for seq type %s\n", &MySub, $SeqType;
      return undef;
    }
    unless ($_SeqSmbFreqTab{$SeqType} = &DataRead ($_LibParam{SrcSmbFreq}{$SeqType}) and
            %{$_SeqSmbFreqTab{$SeqType}}) {
      printf STDERR "%s. ERROR in data-read of file %s\n", &MySub, $_LibParam{SrcSmbFreq}{$SeqType};
      return undef;
    }
  }

  # exit SUB
  return $_SeqSmbFreqTab{$SeqType}{$SeqSample};
}


################################################################################
# nucleotide to protein translation & re-translation
################################################################################


$_LibParam{SrcTranslTab} = "$CorePath{call}{PerlScript}/SeqLab/TranslTab.dat";

# codon translation table(s)
#
# DESCRIPTION
# - currently, only the standard translation table is supported ("SGC0" at NCBI)
# - RNA encoding is expected
# - table 'RNA2Protein' presents a codon to aa translation
#
my %_TranslTab = (
  RNA2Protein => [],
  Protein2DNA => [],
  );


# return translation table
#
# INTERFACE
# - options:
#   -debug      [STD]
#   -mode       translation mode:
#               RNA2Protein  default: translation table RNA -> protein
#               Protein2DNA  translation table protein -> DNA
#   -table      genetic code ID (numeric), according to NCBI/GenBank,
#               default: 1 = standard / universal.
#               translation table 0 is undefined
#
# - return val: - reference to translation table
#               - undef if an error occurs
#
# DESCRIPTION
# - if called the first time, the translation table is loade from
#   a source file.
#
sub _TranslTabAccess {
  my (%opt) = @_;
  my ($debug, $TableID, $TableMode);
  my (%ReTransl, $ItAa, $ItCodon, $CodonDNA, $PosCodon, $wobble);

  # function parameters
  $debug = $opt{-debug};
  $TableID = $opt{-table} || 1;
  $TableMode = $opt{-mode} || 'RNA2Protein';

  # RNA->protein translation table already present?
  unless (exists($_TranslTab{RNA2Protein}) and @{$_TranslTab{RNA2Protein}}) {

    # load RNA->protein translation table from source file
    unless ($_TranslTab{RNA2Protein} = &DataRead($_LibParam{SrcTranslTab})
      and @{$_TranslTab{RNA2Protein}}
    ) {
      printf STDERR "%s. ERROR in data-read of file %s\n", &MySub, $_LibParam{SrcTranslTab};
      return 0;
    }
  }

  # return / work out translation table of the requested mode
  if (0) {
  } elsif ($TableMode eq 'RNA2Protein') {
    return $_TranslTab{RNA2Protein}[$TableID];
  } elsif ($TableMode eq 'Protein2DNA') {
    unless (@{$_TranslTab{Protein2DNA}} and $_TranslTab{Protein2DNA}[$TableID]) {
      $_TranslTab{Protein2DNA}[$TableID] = {};
      foreach $ItCodon (grep { $_ !~ m/N/ }
               grep { length($_) == 3 }
               keys %{$_TranslTab{RNA2Protein}[$TableID]}) {
        $CodonDNA = $ItCodon;
        $CodonDNA =~ tr/uU/tT/;
        push @{$ReTransl{$_TranslTab{RNA2Protein}[$TableID]{$ItCodon}}}, $CodonDNA;
      }
      foreach $ItAa (keys %ReTransl) {
        foreach $PosCodon (0..2) {
          $wobble = join ('', sort (map { substr ($_, $PosCodon, 1) } @{$ReTransl{$ItAa}}));
          while ($wobble =~ s/([A-Z])\1/$1/g) { next }
          $_TranslTab{Protein2DNA}[$TableID]{$ItAa} .=
            $SyntaxTranslWobble{Bases2Iupac}{$wobble};
        }
      }
    }
    return $_TranslTab{Protein2DNA}[$TableID];
  } else {
    printf STDERR "%s. ERROR: requested translation mode not available: %s\n", &MySub, $TableMode;
    return undef;
  }
}


# return amino acid translation of nucleotide sequence
#
# INTERFACE
# - argument 1: source nucleotide sequence string
#
# - options:
#   -debug      [STD]
#   -frame      specify translation frame, possible:
#               '+1' (default), '+2', '+3', '-1', '-2', '-3'.
#               Numbering of negative Frames refers to the right end of the
#               input sequence.
#   -table      genetic code ID (numeric), according to NCBI/GenBank,
#               default: 1 = standard / universal.
#
# - return val: protein sequence string (may include stops as '*')
#
# DESCRIPTION
# - currently, only the standard translation table (#1) is supported
# - DNA and RNA coding is supported (bases T/U). In the translation
#   table, codons are defined using U.
# - The input sequence string is purified prior to the operation.
# - cmp. code in /gen/fly/biosw/bioperl/fisher/nt2aa.pl
#
sub TranslNt {

  # function parameters
  my ($sSeqNt,%opt) = @_;
  my $debug = $opt{-debug};
  my $pTranslTab = &_TranslTabAccess (-table=>$opt{-table}, -debug=>$opt{-debug});
  unless ($pTranslTab) {
    die sprintf "%s. ERROR: no translation table %d\n", &MySub, $opt{-table}||1;
  }
  my $frame = $opt{-frame} || '+1';

  # prior preparation of sequence/position data
  if ($frame<0) { $sSeqNt = &SeqStrRevcompl($sSeqNt) }
  my $sSeqRna = &SeqStrPure ($sSeqNt, -SeqType=>'RNA5', -upper=>1);
  my $iSeqLen = length $sSeqRna;
  my $pos = abs($frame)-1;
  if ($pos) { $sSeqRna = substr($sSeqRna,$pos) }

  # do translation
  # this is faster than a loop over fragments from split (/([a-zA-Z]{3})/,$sSeqRna)
  my $sSeqProt = '';
  for ($pos=0; $pos<$iSeqLen; $pos+=3) {
    my $ItCodon = substr ($sSeqRna, $pos, 3);
    if (length($ItCodon) < 3) { last }
    $sSeqProt .= $$pTranslTab{$ItCodon} || 'X';
  }

  # return successfully
  return $sSeqProt;
}


# return nucleotide translation of amino acid sequence
#
# INTERFACE
# - argument 1: source protein sequence string
#
# - options:
#   -debug      [STD]
#   -table      genetic code ID (numeric), according to NCBI/GenBank,
#               default: 1 = standard / universal.
#
# - return val: - nucleotide sequence string (will probably include undef symbols)
#               - undef if an error occurred
#
# DESCRIPTION
# - currently, only standard translation table (#1) is supported
# - cmp. code in /gen/fly/biosw/bioperl/fisher/nt2aa.pl
# - The input sequence string is purified prior to the operation.
#
sub TranslProt {
  my ($SeqProt,%opt) = @_;
  my $debug = $opt{-debug};
  my $pTranslTab = &_TranslTabAccess (-mode=>'Protein2DNA', -table=>$opt{-table}, -debug=>$opt{-debug});
  unless ($pTranslTab) {
    $debug and printf STDERR "%s. ERROR: no translation table %d\n", &MySub, $opt{-table}||1;
    return undef;
  }
  $SeqProt = &SeqStrPure ($SeqProt, -SeqType=>'protein', -upper=>1);

  # do translation
  my $SeqDNA;
  foreach my $aa (split (//, $SeqProt)) {
    $SeqDNA .= $$pTranslTab{$aa} || 'NNN';
  }

  # exit SUB successfully
  return $SeqDNA;
}


# return longest ORF found in a nucleotide sequence
#
# INTERFACE
# - argument 1: source nucleotide sequence string
#
# - options:
#   -debug      [STD]
#   -SlcFrame   specify a selection list of the translation frames (reference
#               array of frame specifiers)
#   -SlcStart   select starting amino acid by regexp, default: M
#               default: 1 = standard / universal.
#
# - return val: - reference to result data structure, hash with fields:
#                 frame     including sign reporting direction
#                 offset    position of first nt of start codon
#                 end       position of last nt of CDS (excluding stop codon)
#                 LengthNt  length of CDS excluding stop
#                 LengthP   length of protein
#                 SeqP      protein sequence string
#               - undef if an error occurred
#
# DESCRIPTION
# - both, DNA and RNA coding is supported (bases T/U). In the translation
#   table codons are defined using U.
# - The input sequence string is purified prior to the operation. Accordingly,
#   all positional information refers to the purified sequence string.
# - currently, only standard translation table (#1) is supported
#
sub TranslOrfMax {
  my ($SeqNt,%opt) = @_;
  $SeqNt or die sprintf "%s. ERROR: NULL sequence string\n", &MySub;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my @frame = $opt{-SlcFrame} ? @{$opt{-SlcFrame}} : ( '+1', '+2', '+3', '-1', '-2', '-3' );
  if (grep { $_ == 0 } map { m/^[-+]?[1-3]$/ ? 1:0; } @frame) { return undef }
  my $RegexpStart = $opt{-SlcStart} || 'M';
  $debug and printf STDERR "%s. passed argument check\n  called by: %s\n  frames: %s\n",
    &MySub, (caller(1))[3], join (', ', @frame);

  # get codon array from sequence
  my %result;
  foreach my $ItFrame (@frame) {
    my $SeqProt = &TranslNt ($SeqNt, -frame=>$ItFrame, -debug=>$dbg2);
    while ($SeqProt =~ m/$RegexpStart\w+\b/g) {

      # keep first or next longer continuous protein sequence
      if (length($&) > $result{LengthP}) {
        %result = (
          frame    => $ItFrame,
          offset   => length($`) * 3 + abs($ItFrame),
          end      => (length($`) + length($&)) * 3 + abs($ItFrame) - 1,
          LengthNt => length($&) * 3,
          LengthP  => length($&),
          SeqP     => $&,
          );
        if ($ItFrame < 0) {
          map { $result{$_} = length($SeqNt) - $result{$_} + 1 } 'offset', 'end';
        }
      }

      # continue in protein sequence string
      pos($SeqProt) = length($`) + 1;
    }
  }

  # exit SUB successfully
  return \%result;
}


# return array of possible translation start sites in a nucleotide sequence
#
# INTERFACE
# - argument 1: source nucleotide sequence
#
# - options:
#   -debug      [STD]
#   -frame      specify translation frame, possible:
#               '+1' (default), '+2', '+3', '-1', '-2', '-3'.
#   -table      genetic code ID (numeric), according to NCBI/GenBank,
#               default: 1 = standard / universal.
#
# - return val: - array of translation start sites:
#                 - reference to position array, numerical order referring
#                   to plus stranded sequence
#                 - reference to codon strings (RNA coded)
#                 - reference to start status probabilities
#                 - complete protein sequence from frame offset on
#               - undef if an error occurred
#
# DESCRIPTION
# - both, DNA and RNA coding is supported (bases T/U). In the translation
#   table codons are defined using U.
# - only translation in one single frame is checked, cmp. option -frame
# - currently, only standard translation table (#1) is supported
#
sub TranslStartArray {
  my ($SeqNt,%opt) = @_;

  # function parameters
  my $debug = $opt{-debug};
  my $pTranslTab = &_TranslTabAccess (-table=>$opt{-table}, -debug=>$opt{-debug});
  unless ($pTranslTab) {
    $debug and printf STDERR "%s. ERROR: no translation table %d\n", &MySub, $opt{-table}||1;
    return undef;
  }
  my $frame = $opt{-frame} || '+1';
  if ($frame<0) { $SeqNt = &SeqStrRevcompl($SeqNt) }
  my $SeqRna = &SeqStrPure ($SeqNt,-SeqType=>'RNA5', -upper=>1);
  my $iSeqLen = length $SeqRna;
  my $StrPos = abs($frame) - 1;
  if ($debug) {
    printf STDERR "%s. starting with parameters:\n", &MySub;
    printf STDERR "  frame: %s\n", $frame||"''";
    printf STDERR "  seq length: %d\n", $iSeqLen;
    printf STDERR "  string pos: %d\n", $StrPos;
  }

  # loop over codon sequence
  my ($SeqTransl,$codon,$probab);
  my (@aPos,@aCodon,@aProbab);
  while ($codon = substr($SeqRna,$StrPos,3)) {
    if (length($codon)<3) { last }

    # build up protein sequence
    $SeqTransl .= $$pTranslTab{$codon} || 'X';

    # look for translation start positions
    if ($probab = $$pTranslTab{start}{$codon}) {
      my $SeqPos = ($frame<0) ? $iSeqLen-$StrPos : $StrPos+1;
      if ($debug) {
        printf STDERR "%s. found start codon:\n", &MySub;
        printf STDERR "  string position: %d\n", $StrPos;
        printf STDERR "  seq position: %d\n", $SeqPos;
        printf STDERR "  codon, aa: %s, %s\n", $codon, $$pTranslTab{$codon} || 'X';
      }
      push @aPos, $SeqPos;
      push @aCodon, $codon;
      push @aProbab, $probab;
    }

    # next codon position
    $StrPos += 3;
  }

  # return result
  if (wantarray) {
    return (\@aPos, \@aCodon, \@aProbab, $SeqTransl);
  } else {
    return \@aPos;
  }
}


# return array of translation stop sites in a nucleotide sequence
#
# INTERFACE
# - argument 1: source nucleotide sequence
#
# - options:
#   -debug      [STD]
#   -frame      specify translation frame, possible:
#               '+1' (default), '+2', '+3', '-1', '-2', '-3'.
#   -table      genetic code ID (numeric), according to NCBI/GenBank,
#               default: 1 = standard / universal.
#
# - return val: - array of translation stop sites:
#                 - reference to position array, numerical order referring
#                   to plus stranded sequence
#                 - reference to codon strings (RNA coded)
#                 - complete protein sequence from frame offset on
#               - undef if an error occurred
#
# DESCRIPTION
# - both, DNA and RNA coding is supported (bases T/U). In the translation
#   table codons are defined using U.
# - only translation in one single frame is checked, cmp. option -frame
# - currently, only standard translation table (#1) is supported
#
sub TranslStopArray {
  my ($sSeqNt, %opt) = @_;
  my ($debug, $pTranslTab, $frame);
  my ($sSeqRna, $iSeqLen, $StrPos, $codon, $sSeqProt);
  my (@PosArr);

  # function parameters
  $debug = $opt{-debug};
  unless ($pTranslTab = &_TranslTabAccess (-table=>$opt{-table},
    -debug=>$opt{-debug})) {
    $debug and printf STDERR "%s. ERROR: no translation table %d\n", &MySub,
      $opt{-table}||1;
    return undef;
  }
  $frame = $opt{-frame} || '+1';
  if ($frame < 0) { $sSeqNt = &SeqStrRevcompl($sSeqNt) }
  $sSeqRna = &SeqStrPure ($sSeqNt,-SeqType=>'RNA5', -upper=>1);
  $iSeqLen = length $sSeqRna;
  $StrPos = abs ($frame) - 1;
  if ($debug) {
    printf STDERR "%s. starting with parameters:\n", &MySub;
    printf STDERR "  frame: %s\n", $frame||"''";
    printf STDERR "  seq length: %d\n", $iSeqLen;
    printf STDERR "  string pos: %d\n", $StrPos;
  }

  # loop over codons
  while ($codon = substr ($sSeqRna, $StrPos, 3)) {
    if (length($codon) < 3) { last }

    # build up protein sequence
    $sSeqProt .= $$pTranslTab{$codon} || 'X';

    # look for translation start positions
    if ($$pTranslTab{$codon} eq '*') {
      $debug and printf STDERR "%s. found stop codon %s at position %d\n", &MySub,
        $codon, $StrPos;
      push @PosArr, ($frame<0) ? $iSeqLen-$StrPos : $StrPos+1;
    }

    # next codon position
    $StrPos += 3;
  }

  # debug
  $debug and printf STDERR "%s. stop codon array from sequence:\n  %s\n", &MySub,
    join (',', @PosArr);

  # return result
  if (wantarray) {
    return (\@PosArr, $sSeqProt);
  } else {
    return \@PosArr;
  }
}


1;
# $Id: SeqBench.pm,v 1.20 2018/06/05 18:02:56 szafrans Exp $
