################################################################################
#
#  kPerl Sequence Laboratory
#  Library for I/O of Sequence File Formats
#
#  copyright (c)
#    Fritz Lipmann Institute Jena, Dept. Genome Analysis, 2005, 2011
#    Karol Szafranski, 2007, 2009
#    Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2004
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2004
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %LibGlob  (not exported, but globally accessible)
#   %_LibParam
#
# - sequence identifiers
#   %reSeqid
#   %_SeqLink
#   &SeqidLink
#   note on complex sequence identifiers
#   %SeqidFunc
#   &SeqidAcc
#   &_SeqidAcc2
#   &_SeqidGi  (not exported, access via %SeqidFunc)
#   &_SeqidGidel  (not exported, access via %SeqidFunc)
#   &SeqidWord
#
# - sequence data structure and sub-structures
#   sequence data structure
#   standard keys for sequence data structure
#     $LibGlob{'Key*'}
#   sequence array data structure
#   sequence annotation data structure
#   sequence annotation group data structure
#
# - I/O meta architecture, I/O of plain formats
#   &SeqFileFormat
#   %SeqFFmtGet
#   %SeqFFmtOutFunc
#   &SeqarrFromFFmt
#   &SeqentryToFFmt
#
#   &SeqentryToId  (not exported, access via %SeqFFmtOutFunc)
#
#   &SeqentryFromPlain  (not exported, access via %SeqFFmtGet)
#   &SeqentryToPlain  (not exported, access via %SeqFFmtOutFunc)
#
#   &SeqarrFromTbl  (not exported, access via %SeqFFmtGet)
#   &SeqentryPopTbl  (not exported, access via %SeqFFmtGet)
#   &SeqentryToTbl  (not exported, access via %SeqFFmtOutFunc)
#
#  (&SeqarrFromStruct  (anonymous via %SeqFFmtGet)
#   &SeqentryPopStruct  (not exported, access via %SeqFFmtGet)
#  (&SeqentryToStruct  (anonymous via %SeqFFmtOutFunc)
#
# - I/O of FeatureTable file format
#   &SeqentryToFTab  (not exported, access via %SeqFFmtOutFunc)
#
# - I/O of GFF file format
#   $LibGlob{GFF}
#   &GffStruct
#   &SeqarrFromGff  (not exported, access via %SeqFFmtGet)
#   $_LibParam{AnnotLabel2Gff}
#   &SeqentryToGff  (not exported, access via %SeqFFmtOutFunc)
#
# - I/O of fastA (Pearson)
#   &SeqentryPopFasta
#   &SeqentryToFasta
#
# - I/O of pretty HTML
#   &SeqentryToPrettyHtml
#
# - I/O of Experiment file format (Staden package)
#   &SeqentryPopExper
#   &_SeqExperOptimize
#   &_SeqExperDicty
#   &SeqExperExpandSeqattrib
#   &_SeqExperClipQual
#   &SeqExperID
#   &SeqentryToExper
#
# - I/O of GenBank format
#   &SeqentryPopGenbank  (not exported, access via %SeqFFmtGet)
#   &_Genbank2Seq
#   &_Genbank2Source
#   &_Genbank2Annot
#   &SeqentryToGenbank  (not exported, access via %SeqFFmtOutFunc)
#
# - sequence string formatting and fragmentation
#   &_SeqStrCluster
#
# - sequence annotation formats
#   &AnnotSort
#   &AnnotRangeRes
#   &AnnotExpandTscr
#   &AnnotExpandSplice
#   $_LibParam{AnnotClip*}
#   &AnnotClip
#
#
#  STD OPTIONS
#
#  -debug      print debug protocol to STDERR
#  -KeyId      use specified key to access identifier (default:
#              $LibGlob{KeyId}).
#  -KeySeq     use specified key to access sequence string (default:
#              $LibGlob{KeySeq}).
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - Interchange the argument/option status of arg1 and option -handle in
#   &SeqentryPop* functions. This would better reflect the requirements:
#   An input handle is absolutely necessary, a filename is dispensable, though
#   the sequence data structure has a data field reserved for the source file
#   path.
#
# - The group feature data structure is incompletely implemented
#   - &AnnotRangeRes generates new groups that're not indexed in
#     $$pSeq{AnnotGrp}
#   - Is &AnnotExpandTscr doing a better job?
#     Especially, check the GenBank passage.
#
# - look also for notes in the header of each function block
#
################################################################################

package SeqLab::SeqFormat;

# includes
use strict; #use warnings;  # OK 20111027
use FileHandle;
use MainLib::StrRegexp qw ($sEndl $reEndl &LineBreak);
use MainLib::Data;
use MainLib::Path qw (%CorePath &PathExpand &PathChgSuffix);
use MainLib::File qw(&ReadFile);
use MainLib::Misc qw(&MySub);
use Math::kCalc;
use SeqLab::SeqBench;
use database::DbPlain qw(&PlainToTable);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  %reSeqid &SeqidLink &LinkSeqid %SeqidFunc &SeqidAcc &SeqidWord
  &SeqFileFormat %SeqFFmtGet %SeqFFmtOutFunc &SeqarrFromFFmt &SeqentryToFFmt
  &SeqentryPopFasta &SeqentryToFasta
  &SeqentryToPrettyHtml
  &GffStruct
  &SeqentryPopExper &SeqExperExpandSeqattrib &SeqExperID &SeqentryToExper
  &AnnotSort &AnnotRangeRes &AnnotExpandTscr &AnnotExpandSplice
    &AnnotClip
  );

# package-wide constants and variables
our %LibGlob;
my %_LibParam;


################################################################################
# sequence identifiers
################################################################################


# regular expressions for sequence identifiers
#
our %reSeqid;

# regular expression for all possible sequence identifiers of common database
# sources
# - NOTE: RegExp contains brackets!
#   in case the regexp used solely you'll find the full identifier in $1
#
$reSeqid{All} = '\b('. join ('|',
  'gi\|\d{3,8}',                           # GenBank gids
  'embu?\|[A-Z]{1,2}\d{5,6}',              # EMBL
  'gbp?\|[A-Z]{1,3}\d{5,6}',               # GenBank nt / aa
  'spu?\|[A-Z]{1,2}\d{5,6}',               # SwissProt
  'dbj\|C\d{5}',                           # DDBJ
  '[a-z]{2,3}\|[A-Z]{1,2}\d{5,6}',         # diverse original IDs
  '[a-z]{2,3}\|\|w+',                      # missing ID numbers, hopefully significant segment descriptor!
  ) .')\b';

# stringent regular expression for sequence identifiers in the Dictyostelium
# read pool used for global genome assembly
# - disregard YAC reads
# - don't regard extra reads
#
$reSeqid{DictyRd} = '\b('. join ('|',
  'JAX4[a-dz]\d{2,3}[a-h]\d{2}\.[rs]\d',   # GSCJ genome shotgun
  'JC1[a-de]\d{2,3}[a-h]\d{2}\.[rs]\d',    # GSCJ chromosome 1
  'JC2[a-hx-z]\d{2,3}[a-h]\d{2}\.[rs]\d',  # GSCJ chromosome 2
  'JC3[a-fg]\d{2,3}[a-h]\d{2}\.[rs]\d',    # GSCJ chromosome 3
  'IIC\w+',                                # Baylor chromosome 4/5
  'IIA\w+',                                # Baylor chromosome 6
  'sdic2\w+\.\w+',                         # Sanger chromosome 2
  'sdi45\w+\.\w+',                         # Sanger chromosome 4/5
  'sdic6\w+\.\w+',                         # Sanger chromosome 6
  ) .')\b';

# stringent regular expression for sequence identifiers in the Dictyostelium
# GSCJ read pool
#
$reSeqid{GscjRd} = '\b('. join ('|',
  'JAX4[a-dz]\d{2,3}[a-h]\d{2}\.[rs]\d',   # genome shotgun
  'JC1[a-de]\d{2,3}[a-h]\d{2}\.[rs]\d',    # chromosome 1
  'JC2[a-hx-z]\d{2,3}[a-h]\d{2}\.[rs]\d',  # chromosome 2
  'JC3[a-fg]\d{2,3}[a-h]\d{2}\.[rs]\d',    # chromosome 3
  '13364a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3850
  '13536a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3307
  '13540a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3817
  '13541a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3167
  '13542a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3567
  '17679a\d{2,3}[a-h]\d{2}\.[rs]\d',       # YAC DY3142
  ) .')\b';


# WWW URL stems for links to extern sequence sources
#
# DESCRIPTION
# - special linking strategies are performed in &Blast::Blast2Html independent
#   of ID matching, e.g.:
#   - Dictyostelium GSC Jena gene models (ID /dd_\d+/)
#
my %_SeqLink = (
  Embl         => 'http://www.ebi.ac.uk/htbin/expasyfetch?',
  ExpEc        => 'http://expasy.hcuge.ch/cgi-bin/get-enzyme-entry?',
  GenbankNt    => 'http://'. $CorePath{www}{NCBI}{host} .'/entrez/query.fcgi?db=Nucleotide&cmd=Search&term=',
    # formerly http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=n&form=6&dopt=g&uid=
  GenbankProt  => 'http://'. $CorePath{www}{NCBI}{host} .'/entrez/query.fcgi?db=Protein&cmd=Search&term=',
    # formerly http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=s&form=6&dopt=g&uid=
  Pdb          => 'http://www.pdb.bnl.gov/pdb-bin/opdbshort?oPDBid=',
  Pir          => 'http://www.gdb.org/bin/bio/wais_q-bio?object_class_key=33&jhu_id=',
  PirGeorge    => 'http://www-nbrf.georgetown.edu/cgi-bin/nbrfget?',
  SwissExpasy  => 'http://www.expasy.ch/cgi-bin/sprot-search-ac?',
  );


# make HTML link from sequence identifier
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: linked sequence identifier
#
# DESCRIPTION
# - this function is safe in the respect that a non-converte
# - &Blast::Blast2Html will invoke this only if the ID string first gives a
#   match to $reSeqid{All}.
#
sub SeqidLink {
  my ($sLink) = @_;

  # link to database
  $sLink =~ s#^gi\|(\d{3,8})$#<A HREF=\"$_SeqLink{GenbankNt}$1\">$&</A>#o;                      # NCBI gid
  $sLink =~ s#^(g[pb]u?|dbj)\|([A-Z]{1,2}\d{5,6})$#<A HREF=\"$_SeqLink{GenbankNt}$2\">$&</A>#o; # GenBank/GenPept
  $sLink =~ s#^(gb)p?\|([A-Z]{3}\d{5,6})$#<A HREF=\"$_SeqLink{GenbankProt}$1\|$2\">$&</A>#o;    # GenPept
  $sLink =~ s#^embu?\|([A-Z]{1,2}\d{5,6})$#<A HREF=\"$_SeqLink{Embl}$1\">$&</A>#o;              # Embl, evtl. => GenBank
  $sLink =~ s#^spu?\|([A-Z]{1,2}\d{5,6})$#<A HREF=\"$_SeqLink{SwissExpasy}$1\">$&</A>#o;        # SwissProt at Expasy
  $sLink =~ s#^pdb\|\w+$#<A HREF=\"$_SeqLink{Pdb}$1\">$&</A>#o;                                 # PDB
  $sLink =~ s#^pir\|([A-Z][A-Z0-9]\d{4})$#<A HREF=\"$_SeqLink{Pir}$1\">$&</A>#o;                # PIR
#  $sLink =~ s#^pir\|([A-Z][A-Z0-9]\d{4})$#<A HREF=\"$_SeqLink{PirGeorge}$1\">$&</A>#o;          # PIR at Univ. Georgetown
#  $sLink =~ s#^EC ([0-9\.]*[0-9])$#<A HREF=\"$_SeqLink{ExpEc}$1\">$&</A>#o;                     # Expasy EC access

  return $sLink;
}

# deprecated alias
sub LinkSeqid { return &SeqidLink(@_) }


# note on complex sequence identifiers
#
# DESCRIPTION
# - "complex sequence identifiers" mostly come from GenBank downloads and have
#   the typical form "gi|7544057|gb|AL049755.2|SPCC569"


our %SeqidFunc = (
  acc   => \&SeqidAcc,
  acc2  => \&_SeqidAcc2,
  acc3  => \&_SeqidAcc3,
  gi    => \&_SeqidGi,
  gidel => \&_SeqidGidel,
  word  => \&SeqidWord,
  ''    => sub { $_[0] },
  );


$_LibParam{reAccDb} = qr(dd?bj|emb|gb|gbp|ref|sp|trembl);

# grab accession number from complex sequence identifier
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the bare accession number,
#   without database prefix or version suffix, if possible. If this grab fails,
#   at least any gi number statement is deleted.
#
sub SeqidAcc {
  my ($sId) = @_;

  # grab accession number
  if ($sId =~ m/\b(?:$_LibParam{reAccDb})\|([^|]+)/o) {
    $sId = $1;
    $sId =~ s/\.\d+$//;
  } else {
    $sId = &_SeqidGidel ($sId);
  }

  return $sId;
}


# grab accession number from complex sequence identifier -
# including database prefix and version suffix
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the accession number,
#   without database prefix, if possible. If this grab fails, at least any
#   gi number statement is deleted.
#
sub _SeqidAcc2 {
  my ($sId) = @_;

  # grab full accession number
  if ($sId =~ m/\b(?:$_LibParam{reAccDb})\|([^|]+)/o) {
    $sId = $1;
  } else {
    $sId = &_SeqidGidel ($sId);
  }

  return $sId;
}


# grab accession number from complex sequence identifier -
# including database prefix and version suffix
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the accession number,
#   including database prefix and version suffix, if possible. If this grab
#   fails, at least any gi number statement is deleted.
#
sub _SeqidAcc3 {
  my ($sId) = @_;

  # grab full accession number
  if ($sId =~ m/\b(?:$_LibParam{reAccDb})\|[^|]+/o) {
    $sId = $&;
  } else {
    $sId = &_SeqidGidel ($sId);
  }

  return $sId;
}


# grab gi identifier from complex sequence identifier
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the bare gi number
#   statement, like "gi|7544057".
#
sub _SeqidGi {
  my ($sId) = @_;

  # grab gi number if present
  if ($sId =~ m/\bgi\|\d+\b/) {
    $sId = $&;
  }

  return $sId;
}


# delete gi identifier from complex sequence identifier
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the bare gi number
#   statement, like "gi|7544057".
#
sub _SeqidGidel {
  my ($sId) = @_;

  # delete gi number if present
  $sId =~ s/(\|?)\bgi\|\d+\b(\|?)/($1 and $2)?$1:''/e;

  return $sId;
}


# grab single word from complex sequence identifier
#
# INTERFACE
# - argument 1: sequence identifier
# - return val: simplified sequence identifier
#
# DESCRIPTION
# - A complex sequence identifier will be reduced to the bare accession number,
#   or to the first encountered word.
#
sub SeqidWord {
  my ($sId) = @_;

  # try to grab accession number
  $sId = SeqidAcc ($sId);
  $sId =~ s/^\W*(?:(?:$_LibParam{reAccDb}|gi)\|)?(\w+).*/$1/o;

  return $sId;
}


################################################################################
# sequence data structure and sub-structures
################################################################################


# sequence data structure
#
# DESCRIPTION
# - structure is a hash constitutively containing following fields (keys):
#   id        identifier, should consist of characters [a-zA-Z_.-] only.
#   sequence  sequence string. The string may contain non-sequence
#             characters.
#   Both these hash field labels may differ from the standard definitions
#   above. This deviation from standard definiton is communicated to the
#   routines via function switches -KeyId and -KeySeq. Cmp. standard field
#   definitions below.
#
# - main additional fields:
#   SeqType   sequence type as diagnosed by &SeqLab::SeqAnalysis::SeqType
#             or provided in file formats: GenBank, ...
#   SrcFmt    file format of the source or destination sequence file.
#             Supported file types see &SeqLab::SeqAnalysis::$_LibParam{'Seq..'}
#   SrcPath   file path of the source or destination sequence file.
#   SrcMulti  flag claiming that source sequence file is a multiple-
#             sequence-entry file.
#             This is essentially needed where multi-sequence input files
#             will result in single-sequence output files, e.g.
#             Blast.pl -plain.
#
# - additional fields specific for some sequence sources:
#   descr     description field which can be derived from most sequence file
#             format types
#   header    field typical for sequence data loaded from a fastA file. This
#             data entry is preserved in order to keep long sequence IDs
#             typical of NCBI fastA downloads.
#   offset    offset in case of a sequence stretch being part of a bigger
#             sequence, especially in case of an alignment entry.
#   annot     annotations put on the sequence (currently supported in formats:
#             Experiment, GenBank). Reference to array of
#             annotation data structures. Structure definition see below.
#             The annotations can be expected to be sorted in a defined order,
#             see &AnnotSort.
#   AnnotGrp  annotation groups. Reference to hash of annotation group
#             data structures indexed by group ID.
#

# standard keys for sequence data structure
$LibGlob{KeyId} = 'id';
$LibGlob{KeySeq} = 'sequence';


# sequence array data structure
#
# DESCRIPTION
# - structure is an array of references to instances of sequence data
#   structures:
#   @SeqArray->%Seq
# - On the basis of this data structure the alignment data structure
#   is defined, see SeqAlign::Align.
#


# sequence annotation data structure
#
# DESCRIPTION
# - structure is a hash containing following fields (keys):
#   -         all position values are in biological notation.
#   type      annotation label
#   offset    offset position of annotated sequence range
#   ( offset_prelim  offset position is preliminary (minimal range)
#   end       end position of annotated sequence range
#   ( end_prelim  end position is preliminary (minimal range)
#   orient    strand direction of annotation. Never switch the values offset/end
#             in order to express negative direction!
#   range     this entry replaces fields offset/end/orient and implements range
#             syntax like in GenBank file format, cmp. &AnnotRangeRes.
#             This syntax will be conserved after primary parsing, but
#             further handling requires meta-type conversion as implemented
#             in &AnnotRangeRes (and &AnnotExpand* call &AnnotRangeRes)
#   group     group label of annotation (use it to combine a set of
#             annotations) to a larger logical structure. This is used in GFF
#             format to point out which features contribute to a set
#             describing a higher order feature.
#   layer     use it to combine a set of annotations. Later on, you may select
#             or group annotations according to the layer information.
#             By default, tags (annotations) from the Experiment file format
#             are put into different layers in order to keep informations like:
#             - consensus- or reading-bound
#             - quality clip positions (special tag syntax)
#             - vector positions (special tag syntax)
#   text      annotation text, plain version of 'qual' field
#   qual      reference to array of qualifiers, each reference to 2-element
#             array of:
#             0   qualifier type
#             1   qualifier value (text)
#

# sequence annotation group data structure
#
# DESCRIPTION
# - structures are kept
# - structure is a hash containing following fields (keys):
#   -         all position values are in biological notation.
#   type      annotation label
#   offset    offset position of leftmost annotated sequence range
#   end       end position of rightmost annotated sequence range
#   orient    strand direction of annotation. Never switch offset/end to
#             express negative direction!
#   range     often used instead of offset/end/orient. Note that we typically
#             deal with complex features
#   text      annotation text, often copied from or delegated to contributing
#             annotations. But, sometimes it would make sense to differenciate
#             the information according to group or member status.
#   layer     same as in sequence annotation data structure
#


################################################################################
# I/O meta architecture, I/O of plain formats
################################################################################


# determine sequence file format
#
# INTERFACE
# - argument 1: - path of source file
#                 Don't expect the code to work with '-' as an alias for \*STDIN
#               - filehandle reference (GLOB, FileHandle).
#
# - options:
#   -debug      [STD]
#
# - return val: - sequence file format descriptor (see "sequence data
#                 structure")
#               - undef if error occurs or end of input is encountered
#
# DESCRIPTION
# - If arg1 = filehandle is not seekable (e.g. input from pipe) then it should
#   be an OO FileHandle that supports multi-byte unget via method ungets().
#   This is fulfilled by CPAN's FileHandle::Unget.
#
sub SeqFileFormat {
  my ($ArgIn,%opt) = @_;
  my $debug = $opt{-debug};

  ##############################################################################
  # pre-work: read from input filehandle

  # ensure to have a buffering filehandle object
  my $hIn = ref($ArgIn) ? $ArgIn : FileHandle->new($ArgIn);
  $debug and printf STDERR "%s. input argument is %s\n", &MySub,
    ($hIn==$ArgIn) ? 'handle '.ref($ArgIn) : 'file';

  # first line, first character
  my ($line,@LineBuff);
  { # redo block: skip leading empty/comment lines
    $line = <$hIn>;
    unless (defined $line) { return undef }
    push @LineBuff, $line;
    if ($line=~m/^\s*$/ or $line=~m/^#/) { redo }
  }
  # unread lines that were read
  if (ref($ArgIn) and @LineBuff) {
    $debug and printf STDERR "%s. unreading %d input line%s\n", &MySub,
      int(@LineBuff), (int(@LineBuff)==1)?'':'s';
    if ($hIn->can('ungets')) {
      $debug and printf STDERR "%s. unreading via method ungets()\n", &MySub;
      $hIn->ungets(join('',@LineBuff));
    } else {
      $debug and printf STDERR "%s. unreading via seek()\n", &MySub;
      seek $hIn, -do{my $sum=0;foreach(@LineBuff){$sum+=length}$sum}, 1;
    }
  }

  # remind first line, first character
  my $char = substr ($line, 0, 1);
  $debug and printf STDERR "%s. first char '%s' in first non-empty line:\n->%s", &MySub,
    $char, $line;

  ##############################################################################
  # determine fromat

  my $format;
  if (0) { }

  # diagnose format by first character
  elsif ($char eq '>') { $format = 'fastA' }
  elsif ($char eq ';') { $format = 'GCG' }

  # diagnose format by first line
  elsif ($line =~ m/^LOCUS /) { $format = 'GenBank' }
  elsif ($line =~ m/^(__my_ref__|(?:HASH(?:\(\\w+\))?)?\{)/) { $format = 'struct' }
  elsif ($line =~ m/^[A-Z]{2} {3}[^ ]/) { $format = 'Experiment' }
  elsif ((!ref($ArgIn) and $ArgIn=~m/\.gff$/i) or
         $line =~ m/^([\w.-]+( +|\t)){3}(\d+( +|\t)){2}/) { $format = 'GFF' }
  elsif ($line =~ m/^[\w.-]{1,24}( +|\t)[\w*-]+$/) { $format = ($1 eq "\t")?'table':'selex' }

  # default format
  else { $format = 'plain' }

  $debug and printf STDERR "%s. format determined as: %s\n", &MySub, $format||"''";
  return $format;
}


# index of I/O functions
our %SeqFFmtGet = (
  Experiment => {
    FuncEntry => \&SeqentryPopExper,
    -ClipUnk  => 1,
    -MatchID  => 1,
    -pure     => 1,
    -SlcID    => 1,
    },
  fastA => {
    FuncEntry => \&SeqentryPopFasta,
    -MatchID  => 1,
    -pure     => 1,
    -SlcID    => 1,
    -SlcDescr => 1,
    },
  FeatureTable => {
    FuncEntry => undef,  # *** implement me ***
    FuncArr   => undef,  # *** implement me ***
    },
  GCG => {
    FuncEntry => undef,  # *** implement me ***
    FuncArr   => undef,  # *** implement me ***
    },
  GFF => {
    FuncArr   => \&SeqarrFromGff,
    -pure     => 0,
    },
  GenBank => {
    FuncEntry => \&SeqentryPopGenbank,
    -MatchID  => 1,
    -pure     => 1,  # GenBank sequences are always pure
    -SlcID    => 1,
    -SlcDescr => 1,
    -SlcType  => 1,
    },
  plain => {
    FuncEntry => \&SeqentryFromPlain,
    -MatchID  => 0,  # file format doesn't support ID field
    -pure     => 1,
    -SlcDescr => 0,  # file format doesn't support descriptions
    -SlcID    => 0,  # file format doesn't support ID field
    },
  PrettyHtml => undef,  # it's not intended to implement an input routine
  Staden => {
    FuncEntry => undef,  # *** implement me ***
    FuncArr   => undef,  # *** implement me ***
    },
  struct => {
    FuncEntry => \&SeqentryPopStruct,
    FuncArr   => sub { my ($f,%opt) = @_;
        my $d = &DataRead ($opt{-handle}||$f,%opt);
        return $d;
      },
    # standard parsing function doesn't support neither sequence-specific logical
    # selection nor sequence conversion:
    -MatchID  => 0,
    -pure     => 0,
    -SlcDescr => 0,
    -SlcID    => 0,
    },
  table => {
    FuncArr   => \&SeqarrFromTbl,
    FuncEntry => \&SeqentryPopTbl,
    -MatchID  => 0,
    -pure     => 0,
    -SlcDescr => 0,  # file format doesn't support descriptions
    -SlcID    => 0,
    },
  );
# file format aliasing (from perspective of parsing routine)
$SeqFFmtGet{selex} = $SeqFFmtGet{table};

our %SeqFFmtOutFunc = (
  Experiment   => \&SeqentryToExper,
  fastA        => \&SeqentryToFasta,
  FeatureTable => \&SeqentryToFTab,
  GCG          => undef,  # *** implement me ***
  GFF          => \&SeqentryToGff,
  GenBank      => \&SeqentryToGenbank,
  ID           => \&SeqentryToId,
  plain        => \&SeqentryToPlain,
  PrettyHtml   => \&SeqentryToPrettyHtml,
  selex        => \&SeqentryToTbl,
    # selex format output needs buffering of output!
    # Done by SeqLab::SeqStreamOut
  Staden       => undef,  # *** implement me ***
  struct       => sub {
      my ($pSeq,%opt) = @_;
        # &DataPrint default options may be overriden by arguments
      require MemHandle;
      my $hMem = MemHandle->new();
      $hMem = MemHandle->new();
      &DataPrint([$pSeq],-handle=>$hMem,-NoAddr=>1,-NoMyRef=>1,%opt);
      return $hMem->mem();
    },
  table        => \&SeqentryToTbl,
  );


# read sequence file of any format to sequence array data structure
#
# INTERFACE
# - argument 1: path of source file
#               Don't expect the code to work with '-' as an alias for \*STDIN
#               The function does not work with non-physical (piped) input.
#
# - options (see called subfunctions for option details):
#   -debug      [STD]
#   -fast       parse file quick and dirty. This option takes effect
#               only for the formats: Experiment, GenBank.
#   -format     specify sequence file format, default: determine from file
#   -KeyId      [STD]
#   -MatchID    select by matching to sequence identifier. This is not
#               necessarily supported by the parsing function. Selector may be:
#               ARRAY ref  list of sequence identifiers (not RegExps!)
#                          Hash is preferred due to faster performance.
#               HASH ref   keyed list of sequence identifiers (not RegExps!)
#   -pure       purify sequence strings for sequence-coding letters.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Positional references are adjusted if needed.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#   -SlcID      select by regexp applied to sequence identifier. This is not
#               necessarily supported by the parsing function.
#
# - return val: - reference to sequence array data structure
#               - undef if an error occurred
#
sub SeqarrFromFFmt {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my ($FmtType, $pSeq, $pSeqarr);

  # ensure input file exists
  unless (-r $PathIn) {
    unless (-e $PathIn) {
      printf STDERR "%s. ERROR: input file %s doesn't exist\n", &MySub, $PathIn||"''";
    } else {
      printf STDERR "%s. ERROR: unable to read from input file %s\n", &MySub, $PathIn||"''";
    }
    return undef;
  }

  # determine input format
  $FmtType = $opt{-format} || &SeqFileFormat($PathIn,%opt);
  $debug and printf STDERR "%s. parsing sequence from %s format (%s)\n", &MySub,
    $FmtType||"''", $opt{-format} ? 'specified by caller' : 'determined from file';

  # get sequence(s)
  if (ref($SeqFFmtGet{$FmtType}{FuncArr}) eq 'CODE') {
    $pSeqarr = &{$SeqFFmtGet{$FmtType}{FuncArr}} ($PathIn,%opt);
  } elsif (ref($SeqFFmtGet{$FmtType}{FuncEntry}) eq 'CODE') {
    $opt{-handle} = FileHandle->new($PathIn);
    while (defined ($pSeq=&{$SeqFFmtGet{$FmtType}{FuncEntry}}($PathIn,%opt)) ) {
      push @$pSeqarr, $pSeq;
    }
  } else {
    printf STDERR "%s. ERROR: unknown sequence file format %s (input filter not defined)\n", &MySub, $FmtType||"''";
    return undef;
  }
  $debug and printf STDERR "%s. read %d seq entries\n", &MySub, int(@$pSeqarr);

  # return data
  return $pSeqarr;
}


# convert sequence data structure to file format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options (see called subfunctions for option details):
#   -debug      [STD]
#   -format     specify sequence file format, default: fastA
#   -KeyId      [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               NOTE: This option is not always supported in the specific
#               formatting function.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Positional references are adjusted if needed.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#
# - return val: - sequence entry in plain file format
#               - undef if an error occurred
#
sub SeqentryToFFmt {
  my ($pSeq,%opt) = @_;
  my $ffmt = $opt{-format} || 'fastA';

  # get sequence(s)
  my ($sSeq);
  if (ref($SeqFFmtOutFunc{$ffmt}) eq 'CODE') {
    $sSeq = &{$SeqFFmtOutFunc{$ffmt}} ($pSeq,%opt);
  } else {
    printf STDERR "%s. ERROR: unknown sequence file format %s\n", &MySub, $ffmt||"''";
    return undef;
  }

  return $sSeq;
}


# convert sequence data structure to line containing sequence identifier
#
# INTERFACE
# - argument 1: reference to sequence data structure
# - options:
#   -KeyId      [STD]
# - return val: - sequence identifier
#               - undef if an error occurred
#
sub SeqentryToId {
  my ($pSeq,%opt) = @_;
  my ($KeyId);

  # function parameters
  $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  return $$pSeq{$KeyId}.$sEndl;
}


# read plain sequence file to sequence data structure
#
# INTERFACE
# - argument 1: path of source file
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may be specified via switch -handle. Though, the path
#                 argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - handle won't be closed explicitly after work's done or
#                 an error occurs.
#   -KeyId      [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               By default, non-standard sequence-coding characters are
#               deleted.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#
# - return val: - reference to sequence data structure
#               - undef if an error occurred
#
sub SeqentryFromPlain {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # read sequence string from file
  my $sSeq;
  unless ($sSeq = &ReadFile($opt{-handle}||$PathIn)) {
    $debug and printf STDERR "%s. ERROR: unable to read from %s\n", &MySub,
      $opt{-handle} ? 'handle' : ('file '. ($PathIn||"''"));
    return undef;
  }
  if ($opt{-pure}) {
    $sSeq = &SeqStrPure ($sSeq, -SeqType=>$opt{-pure});
  } else {
    $sSeq =~ tr/ \t\n\r//d;
  }
  $sSeq or return undef;

  # process sequence ID
  $PathIn =~ m|^(.*/)?([\w-]+)|;
  my $sId = $2 || 'UnknownSeq';

  # enter sequence data fields
  my %seq = (
    $KeyId   => $sId,
    sequence => $sSeq,
    SrcPath  => (-f $PathIn) ? &PathExpand($PathIn) : $PathIn,
    SrcFmt   => 'plain',
    SrcMulti => 0,  # is always single-sequence file
    );

  # return sequence entry reference
  return \%seq;
}


# convert sequence data structure to file format 'plain'
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -debug      [STD], handed over to &_SeqStrCluster
#   -KeySeq     [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#
# - return val: - sequence entry in plain file format
#               - undef if an error occurred
#
sub SeqentryToPlain {
  my ($pSeq,%opt) = @_;
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};
  return &_SeqStrCluster ($$pSeq{$KeySeq}, -blocks=>1, -chars=>60, %opt);
}


# read array of sequence data structures from table-formatted seq file
#
# INTERFACE
# - argument 1: path of source file
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may be specified via switch -handle. Though, the path
#                 argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - handle won't be closed explicitly after work's done or
#                 an error occurs.
#   -KeyId      [STD]
#   -KeySeq     [STD]
#
# - return val: - reference to array of sequence entries
#               - undef if an error occurred
#
sub SeqarrFromTbl {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId}  || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};

  # read table format
  my $pSeqArr = &PlainToTable ($opt{-handle} || $PathIn,
    -TabType=>'AH', -delimit=>{ line=>$reEndl, col=>' +|\t' },
    -comments=>1, -ColLabel=>[$KeyId,$KeySeq], -debug=>$dbg2);

  # enter source fields
  foreach (@$pSeqArr) {
    if (length ($PathIn)) {
    $_->{SrcPath}  = (-f $PathIn) ? &PathExpand($PathIn) : $PathIn;
    }
    $_->{SrcFmt}   = 'table';
    $_->{SrcMulti} = (int(@$pSeqArr) > 1);
  }

  return $pSeqArr;
}


# read sequence data structure from table-formatted seq file
#
# INTERFACE
# - argument 1: path of source file
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may, and should, be specified via switch -handle. Though, the
#                 path argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - unless this option is used, it's impossible to read multiple
#                 sequences from a multi-sequence file via this function.
#   -KeyId      [STD]
#   -KeySeq     [STD]
#
# - return val: - reference to array of sequence entries
#               - undef if there's no additional sequence entry
#               - undef if an error occurred
#
sub SeqentryPopTbl {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $KeyId  = $opt{-KeyId}  || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};

  # ensure to have a filehandle
  if (!$opt{-handle} and $opt{-debug}) {
    printf STDERR "%s. WARNING: sequential reading does not make sense with a non-handle input argument\n", &MySub;
  }
  my $hIn = $opt{-handle} || FileHandle->new($PathIn);
  if (ref($hIn) eq 'GLOB') {
    $hIn = FileHandle->new_from_fd($hIn,'r') or die sprintf "%s. fdup ERROR", &MySub;
  }

  # multi-sequence input?
  my $bSrcMulti = int ($hIn->tell()>0);

  # read table line
  my ($line,$pSeq);
  unless (defined ($line=$hIn->getline())) { return undef }
  chomp $line;
  if ($line =~ m/\s+/) {
    $$pSeq{$KeyId} = $`;
    $$pSeq{$KeySeq} = $';
  } else {
    $debug and printf STDERR "%s. format ERROR, input line %d\n", &MySub, $.;
    return undef;
  }

  # enter source fields
  if (length ($PathIn)) {
  $$pSeq{SrcPath}  = (-f $PathIn) ? &PathExpand($PathIn) : $PathIn;
  }
  $$pSeq{SrcFmt} = 'table';
  $$pSeq{SrcMulti} = $bSrcMulti || int(!$hIn->eof());

  # return sequence entry reference
  return $pSeq;
}


# convert sequence data structure to table file format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -delimit    field delimiter string, default: "\t"
#   -KeyId      [STD]
#   -KeySeq     [STD]
#
# - return val: - sequence entry in table file format
#               - undef if an error occurred
#
sub SeqentryToTbl {
  my ($pSeq,%opt) = @_;
  my $KeyId  = $opt{-KeyId}  || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};
  my $delimit = (exists($opt{-delimit}) and length($opt{-delimit})) ?
    $opt{-delimit} : "\t";
  return $$pSeq{$KeyId} . $delimit . $$pSeq{$KeySeq} . $sEndl;
}


# read plain sequence file to sequence data structure
#
# INTERFACE
# - argument 1: path of source file
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may, and should, be specified via switch -handle. Though, the
#                 path argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - unless this option is used, it's impossible to read multiple
#                 sequences from a multi-sequence file via this function.
#               - If the filehandle is not seekable (e.g. input from pipe) then
#                 it should be an OO FileHandle that supports multi-byte unget
#                 via method ungets(). This is fulfilled by CPAN's
#                 FileHandle::Unget.
#
# - return val: - reference to sequence data structure
#               - undef if an error occurred
#
sub SeqentryPopStruct {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;

  # ensure input handle
  if (!$opt{-handle} and $opt{-debug}) {
    printf STDERR "%s. WARNING: sequential reading does not make sense with a non-handle input argument\n", &MySub;
  }
  my $hIn = $opt{-handle} || FileHandle->new($PathIn);
  if (ref($hIn) eq 'GLOB') {
    $hIn = FileHandle->new_from_fd($hIn,'r') or die sprintf "%s. fdup ERROR", &MySub;
  }

  # multi-sequence input?
  my $bSrcMulti = int ($hIn->tell()>0);

  # skip data structure scaffold
  { my $line;
    while (defined($line=<$hIn>) and $line=~m/^(__my_ref__|(?:HASH(?:\(\\w+\))?)?\{|\}\n)/) { }
    # no more sequence entries?
    if (! length($line)) { return undef }
    # unread lastly read line, i.e. first line of next following data structure
    if ($hIn->can('ungets')) {
      $hIn->ungets($line);
    } else {
      seek $hIn, -length($line), 1;
    }
  }

  # read sequence entry
  my $pSeq = &DataRead ($hIn, -debug=>$dbg2);
  $debug and printf STDERR "%s. read seq %s, %d data fields\n", &MySub,
    $$pSeq{id}, int(keys %$pSeq);
  unless (exists($$pSeq{id}) and exists($$pSeq{sequence})) {
    $debug and printf STDERR "%s. WARNING: missing basic fields \"id\" or \"sequence\", seq %s\n", &MySub,
      $$pSeq{id}||$PathIn||$hIn||"''";
  }

  # add sequence source data fields
  if (length ($PathIn)) {
  $$pSeq{SrcPath}  = (-f $PathIn) ? &PathExpand($PathIn) : $PathIn;
  }
  $$pSeq{SrcFmt}   = 'struct';
  $$pSeq{SrcMulti} = $bSrcMulti || int(!$hIn->eof());

  # return sequence entry reference
  return $pSeq;
}


################################################################################
# I/O of FeatureTable file format
################################################################################


# convert sequence data structure to file format "FeatureTable"
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -debug      [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               *** not implemented ***
#
# - return val: - sequence entry in plain file format
#               - undef if an error occurred
#
sub SeqentryToFTab {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};

  my ($pAnnot, @range, $CtRange, @SeqLine);

  # format annotation qualifiers
  foreach $pAnnot (@{$$pSeq{annot}}) {
    if (! exists($$pAnnot{qual}) and $$pAnnot{text}) {
      while ($$pAnnot{text} =~ m/^(\w+)=(.+)$/g) {
        push @{$$pAnnot{qual}}, [$1, $2];
      }
    }
    if (exists($$pAnnot{range}) and $$pAnnot{range}) {
      @range = ();
      while ($$pAnnot{range} =~ m/([<>=]*\d+)\.\.([<>=]*\d+)/g) {
        push @range, [$1, $2];
      }
      if ($$pAnnot{range} =~ m/complement/) {
        @range = map { [$_->[1],$_->[0]] } reverse @range;
      }
    } else {
      @range = ($$pAnnot{orient} >= 0) ?
        ([$$pAnnot{offset},$$pAnnot{end}]) : ([$$pAnnot{end},$$pAnnot{offset}]);
    }
    for ($CtRange=0; $CtRange<@range; $CtRange++) {
      push @SeqLine, sprintf ("%s\t%s%s\n",
        $range[$CtRange][0], $range[$CtRange][1],
        ($CtRange==0) ? "\t".$$pAnnot{type}:'');
    }
    foreach (@{$$pAnnot{qual}}) {
      push @SeqLine, sprintf ("\t\t\t%s\t%s\n", $_->[0], $_->[1]);
    }
  }

  return join('',@SeqLine);
}


################################################################################
# I/O of GFF file format
################################################################################


$LibGlob{GFF} = {
  ColIn  => [qw(id    method feature offset end score orient      frame group text)],
  ColOut => [qw(seqid method feature start  end score orientation frame group comment)],
  };


# read GFF file into GFF data structure
#
# INTERFACE
# - argument 1: input argument, either:
#               - path of source file
#               - filehandle reference
#
# - options:
#   -debug      [STD]
#
# - return val: - reference to GFF data structure, which is:
#                 array of hashes with keys as defined in $LibGlob{GFF}
#               - undef if an error occurred
#
sub GffStruct {
  my ($ArgIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # filehandle from input argument
  my $hIn = ref($ArgIn) ? $ArgIn : FileHandle->new($ArgIn);

  # parse tabular format
  my ($pGffStruct);
  unless ($pGffStruct = &PlainToTable ($hIn, -TabType=>'AH',
    -comments=>1, -ColLabel=>$LibGlob{GFF}{ColIn}, -debug=>$dbg2)) {
    $debug and printf STDERR "%s. ERROR: no data structure from *.GFF input %s\n", &MySub, $ArgIn||"''";
    return undef;
  }
  $debug and printf STDERR "%s. %d entries in GFF file %s\n", &MySub, int @$pGffStruct, $ArgIn;

  # translate orientation syntax
  map { $_->{orient} = ($_->{orient} =~ m/^[+-]$/) ? $_->{orient}.'1' : 0; }
    @$pGffStruct;

  # translate group syntax
  foreach (@$pGffStruct) {
    if ($_->{group}=~m/Parent=(.+?);.*Target=/) { $_->{group}=$1 }
  }

  # translate comment => text
  foreach (@$pGffStruct) {
    unless (length($_->{text})) { delete $_->{text}; next; }
    $_->{text} =~ s/'(\\[rnt])'/eval('"'.$1.'"')/eg;
  }

  # return GFF data reference
  $dbg2 and &DataPrint ($pGffStruct, -handle=>\*STDERR);
  return $pGffStruct;
}


# read GFF file and accompanying sequence file to sequence data structure
#
# INTERFACE
# - argument 1: path of source file
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may be specified via switch -handle. Though, the path
#                 argument enables the function to enter a value for the
#                 according data field.
#                 And - especially for GFF - we'll need a file path to retrieve
#                 an accompanying sequence file.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - handle won't be closed explicitly after work's done or
#                 an error occurs.
#
# - return val: - reference to sequence data structure
#               - undef if an error occurred
#
sub SeqarrFromGff {
  my ($PathGff,%opt) = @_;
  my $debug = $opt{-debug} || 0;
  my (%SeqIdx,%SeqIdOrder,@seq);

  # read GFF file: table AH -> $hash{$SeqID}
  my $pGffStruct = &GffStruct($opt{-handle}||$PathGff,%opt);
  unless ($pGffStruct) {
    die sprintf "%s. ERROR: no data structure from *.GFF file %s\n", &MySub, $PathGff||"''";
  }
  my $CtId;
  foreach my $pAnnot (@$pGffStruct) {
    $SeqIdOrder{$$pAnnot{id}} ||= ++ $CtId;
    $$pAnnot{type} = $$pAnnot{feature};
    delete $$pAnnot{feature};
    $SeqIdx{$$pAnnot{id}} ||= { id=>$$pAnnot{id}, annot=>[], sequence=>'' };
    push @{$SeqIdx{$$pAnnot{id}}{annot}}, $pAnnot;
    delete $$pAnnot{id};
  }
  undef $pGffStruct;

  # sort annotations
  foreach (values %SeqIdx) {
    @{$_->{annot}} = &AnnotSort (@{$_->{annot}});
  }
  if ($debug>1) {
    printf STDERR "%s. full sequence index data structure:\n%s", &MySub;
    &DataPrint (\%SeqIdx, -handle=>\*STDERR, -space=>2);
  }

  # read accompanying sequence file
  # join sequence string into into sequence data structure that already contains
  #   annotation data structure (from GFF)
  my $PathGffStamp = &PathChgSuffix ($PathGff, '', -last=>1);
  my $PathSeq = (
    grep { -r $_ and -f $_} map { $PathGffStamp.$_ }
    '.fa', '.fasta', '.tbl', '.table', '.pln', ''
    )[0];
  if ($PathSeq) {
    @seq = grep { exists($SeqIdx{$_->{id}}) }
      map {@{$_||[]}} &SeqarrFromFFmt($PathSeq,-debug=>$debug);
    if ($debug) {
      printf STDERR "%s. found accompanying sequence file for *.GFF file, %d (selected) entries\n", &MySub, int(@seq);
      printf STDERR "  seq path %s\n", $PathSeq||"''";
      printf STDERR "  GFF path %s\n", $PathGff||"''";
    }
    foreach (@seq) {
      $SeqIdx{$_->{id}}{sequence} = $_->{sequence};
    }
  } else {
    printf STDERR "%s. WARNING: unable to find accompanying sequence file for *.GFF file\n", &MySub;
    printf STDERR "  GFF file %s\n", $PathGff||"''";
    printf STDERR "  GFF stamp: %s\n", $PathGffStamp||"''";
  }
  # report missing sequences
  my @SeqMiss = map { $_->{id} } grep { ! $_->{sequence} } values %SeqIdx;
  if (@SeqMiss) {
    printf STDERR "%s. WARNING: didn't retrieve sequences for %d GFF entr%s\n", &MySub,
    printf STDERR "  missing seqs: %s\n",
      join(' ',&ListMaxfirst(\@SeqMiss,5,-ElemExceed=>'...'));
    printf STDERR "  accompanying seq file: %s\n", $PathSeq;
    printf STDERR "  seqs in file: %s\n",
      join (' ', map {$_->{id}||"''"} &ListMaxfirst(\@seq,5,-ElemExceed=>{id=>'...'}));
  }

  # order seqs as in original GFF file, refine sequence data structures
  @seq = sort {
    $SeqIdOrder{$$a{id}} <=> $SeqIdOrder{$$b{id}}
  } map {
    if (length ($PathGff)) {
      $_->{SrcPath} = (-f $PathGff) ? &PathExpand($PathGff) : $PathGff;
    }
    $_->{SrcFmt} = 'GFF';
    $_->{SrcMulti} = (int(@seq)>1) ? 1 : 0;
    $_;
  } values %SeqIdx;

  # debug, return
  if ($debug>1) {
    printf STDERR "%s. full sequence array data structure:\n%s", &MySub;
    &DataPrint (\@seq, -handle=>\*STDERR, -space=>2);
  }
  return \@seq;
}


# translation table for annotation labels coming from different sources
# key    source label
# value  taget label
#
$_LibParam{AnnotLabel2Gff} = {
  standard => {
    CDS_exon => 'exon',
    },
  geneid => {
    },
  };

# format sequence data structure to GFF file format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is left unchanged
#
# - options:
#   -debug      [STD]
#   -FormatSub  GFF sub-format, supported:
#               standard  ...
#               geneid    (default) Exons are organized in four categories.
#   -KeyId      [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               Positional references in annotation entries are adjusted.
#
# - return val: plain GFF file format
#
# DEBUG, CHANGES, ADDITIONS
# - &DataClone will disconnect referencing between $$pSeq{annot} and
#   $$pSeq{AnnotGrp}.
# - use of a translation table for annotation labels has experimental status.
#
sub SeqentryToGff {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};
  my $FormatSub = $opt{-FormatSub} || 'geneid';

  # validate arguments & options
  %$pSeq or return '';
  # BEWARE: annotation data may be cross-linked after call of &AnnotExpandTscr
  $pSeq = &DataClone ($pSeq);
  $debug and printf STDERR "%s. working on seq %s\n", &MySub, $$pSeq{$KeyId}||"''";

  # purify sequence
  if ($opt{-pure}) {
    $pSeq = &SeqPure ($pSeq, -SeqType=>$opt{-pure}, -debug=>$dbg2);
  }

  my ($GffPlain);
  my ($pAnnot, %CdsGrp, $ItGrp, $CtLen);

  ##############################################################################
  # data conversion
  if ($$pSeq{annot} and @{$$pSeq{annot}}) {

    # have to do annotation conversion?
    if ($$pSeq{SrcFmt} ne 'GFF') {

      # expand annotation data structure
      # - complex range split feature of via &AnnotRangeRes
      # - nice expansion of gene annotations
      &AnnotExpandTscr ($pSeq, -debug=>$dbg2);

      # translate annotation labels to common GFF syntax
      foreach $pAnnot (@{$$pSeq{annot}}) {

        # special to subtype "geneid"
        # translate exon labels to geneid syntax
        if ($FormatSub eq 'geneid' and $$pAnnot{type} eq 'CDS_exon') {
          $ItGrp = $$pAnnot{group};
          $CdsGrp{$ItGrp} ||= [
            sort { $$a{offset}*$$a{orient} <=> $$b{offset}*$$b{orient} }
            grep { $_->{type} eq 'CDS_exon' } @{$$pSeq{AnnotGrp}{$ItGrp}{member}}
            ];
          if (@{$CdsGrp{$ItGrp}} == 1) {
            $$pAnnot{type} = 'single';
          } elsif (@{$CdsGrp{$ItGrp}} > 2 and ${$CdsGrp{$ItGrp}}[0] ne $pAnnot and ${$CdsGrp{$ItGrp}}[-1] ne $pAnnot) {
            $$pAnnot{type} = 'internal';
          } elsif (${$CdsGrp{$ItGrp}}[0] eq $pAnnot) {
            $$pAnnot{type} = 'first';
          } elsif (${$CdsGrp{$ItGrp}}[-1] eq $pAnnot) {
            $$pAnnot{type} = 'terminal';
          }
        }

        # translate labels according to translation table
        if ($FormatSub and exists $_LibParam{AnnotLabel2Gff}{$FormatSub}{$$pAnnot{type}}) {
          $$pAnnot{type} = $_LibParam{AnnotLabel2Gff}{$FormatSub}{$$pAnnot{type}};
        }
        if (exists $_LibParam{AnnotLabel2Gff}{standard}{$$pAnnot{type}}) {
          $$pAnnot{type} = $_LibParam{AnnotLabel2Gff}{standard}{$$pAnnot{type}};
        }

        # comment/text field need to be single-line
        if (exists($$pAnnot{text}) and length($$pAnnot{text})) {
          $$pAnnot{text} =~ s/\n/'\\n'/g;
          $$pAnnot{text} =~ s/\r/'\\r'/g;
          $$pAnnot{text} =~ s/\t/'\\t'/g;
        }
      }  # end foreach $pAnnot

      # create frame field conforming to geneid syntax
      if ($FormatSub eq 'geneid') {
        foreach $ItGrp (values %CdsGrp) {
          $CtLen = 3;
          foreach $pAnnot (@$ItGrp) {
            $$pAnnot{frame} ||= ($CtLen % 3) ? 3 - ($CtLen % 3) : 0;
            $CtLen = $$pAnnot{end} - $$pAnnot{offset} + 1 - $$pAnnot{frame} + 3;
          }
          $debug and printf STDERR "%s. seq %s, transcript %s, first exon starting frame (post) %s\n", &MySub,
            $$pSeq{$KeyId}||"''", $$ItGrp[0]{group}||"''", $$ItGrp[0]{frame};
        }
      }
    }
  }  # end if any annotation

  ##############################################################################
  # formatting

  # sequence info, sequence source
  $GffPlain .= sprintf "# sequence %s\n", $$pSeq{$KeyId};
  if ($$pSeq{SrcPath} or $$pSeq{SrcFmt}) {
    $GffPlain .= "# sequence originated from:\n";
    $GffPlain .= sprintf "#   file: %s\n", $$pSeq{SrcPath}||"''";
    $GffPlain .= sprintf "#   file format: %s\n", $$pSeq{SrcFmt}||"''";
    $GffPlain .= "#\n";
  }

  # annotations (GFF supports annotation only)
  if ($$pSeq{annot} and @{$$pSeq{annot}}) {

    # column labels
    $GffPlain .= "# column labels:\n";
    $GffPlain .= sprintf "# %s\n", join ("\t", @{$LibGlob{GFF}{ColIn}});

    # loop over annotations
    # annotation entries are expected to be in standar order, cf. &AnnotSort
    foreach $pAnnot (@{$$pSeq{annot}}) {
      $debug and printf STDERR "%s. annotation: label %s, text %d chars\n", &MySub, $$pAnnot{type}, length $$pAnnot{text};
      $GffPlain .= sprintf "%s\n", join ("\t",
        $$pSeq{$KeyId},
        $$pAnnot{method} || $$pSeq{SrcFmt} || '.',
        $$pAnnot{type},
        $$pAnnot{offset},
        $$pAnnot{end},
        $$pAnnot{score} || '.',
        &SignChar ($$pAnnot{orient}, -allow0=>1),
        exists($$pAnnot{frame}) ? $$pAnnot{frame} : ($$pAnnot{offset}-1)%3,
        $$pAnnot{group} || '.',
        $$pAnnot{text} ? "$$pAnnot{text}" : (),
        );
    }
  }

  return $GffPlain;
}


################################################################################
# I/O of fastA (Pearson)
################################################################################


# read next sequence data structure entry from fastA file
#
# INTERFACE
# - argument 1: path of source file.
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may, and should, be specified via switch -handle. Though, the
#                 path argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - unless this option is used, it's impossible to read multiple
#                 sequences from a multi-sequence file via this function.
#               - If the filehandle is not seekable (e.g. input from pipe) then
#                 it should be an OO FileHandle that supports multi-byte unget
#                 via method ungets(). This is fulfilled by CPAN's
#                 FileHandle::Unget.
#   -IdSimple   try to decomplex sequence identifier field (deprecated)
#   -KeyId      [STD]
#   -MatchID    select by matching to sequence identifier. Selector may be:
#               ARRAY ref  list of sequence identifiers (not regexps!)
#                          Hash is preferred due to faster performance.
#               HASH ref   keyed list of sequence identifiers (not regexps!)
#                          The hash values need to evaluate to boolean TRUE.
#   -pure       purify sequence string according to specified sequence type,
#               By default, non-standard sequence-coding characters are
#               deleted.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#   -SlcID      select by regexp applied to sequence identifier
#   -SlcDescr   select by regexp applied to sequence description field
#
# - return val: - reference to sequence entry
#               - undef if there's no additional sequence entry
#               - undef if an error occurred
#
# DESCRIPTION
# - The purpose of this function is to read one single sequence entry from
#   a file or, more adequate, from an open file handle.
# - Field SrcMulti reflects the situation quite sensitive. However, erroneous
#   or filtered sequence entries that're preceding or following the returned
#   sequence entry will be counted as additional entries. So, there may be
#   some cases of SrcMulti==1 where only one entry was retrieved from the
#   sequence file (after selection and error skipping).
#
sub SeqentryPopFasta {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # filehandle from input arguments
  if (!$opt{-handle} and $opt{-debug}) {
    printf STDERR "%s. WARNING: sequential reading does not make sense with a non-handle input argument\n", &MySub;
  }
  my $hIn = $opt{-handle} || FileHandle->new($PathIn);
  if (ref($hIn) eq 'GLOB') {
    $hIn = FileHandle->new_from_fd($hIn,'r') or die sprintf "%s. fdup ERROR", &MySub;
  }

  # prepare selectors from selector arguments
  my (%MatchID, $SlcIdRE, $SlcDescrRE);
  if      (ref($opt{-MatchID}) eq 'HASH') {
    %MatchID = %{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers:\n  %s\n", &MySub,
      join (' ', keys %MatchID);
  } elsif (ref($opt{-MatchID}) eq 'ARRAY') {
    %MatchID = map { ($_=>1) } @{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers (built from list):\n  %s\n", &MySub,
      join (' ', keys %MatchID);
  }
  if ($opt{-SlcID}) {
    $SlcIdRE = $opt{-SlcID};
    $debug and printf STDERR "%s. using regexp selector for identifiers: %s\n", &MySub, $SlcIdRE;
  }
  $SlcDescrRE = $opt{-SlcDescr};

  ##############################################################################
  # parsing

  # multi-sequence input?
  my $bSrcMulti = int ($hIn->tell()>0);

  # loop until successful load of one entry
  my %SeqEntry;
  {
    my $line;

    # read header, ensure there's one
    if (!defined($line=<$hIn>) or $line!~m/^>/) {
      $debug and printf STDERR "%s. ERROR: missing header, input line %d\n->%s", &MySub,
        $., $line;
      return undef;
    }
    my $header = $line;
    chomp $header;

    # read sequence
    # grab everything until next header or end of file
    my @SeqLn;
    while (defined($line=<$hIn>) and $line!~m/^>/) {
      push @SeqLn, $line;
    }
    unless (@SeqLn) {
      $debug and printf STDERR "%s. WARNING: sequence entry without sequence information\n"
        . "  header: $header\n", &MySub;
    }

    # reposition on beginning of next sequence entry (the one we've started to
    #   read). This is why FileHandle objects have to refer to true files here
    if (defined($line) and length($line)) {
      if ($hIn->can('ungets')) {
        $debug and printf STDERR "%s. unreading via method ungets()\n", &MySub;
        $hIn->ungets($line);
      } else {
        seek $hIn, -length($line), 1;
      }
    }

    # extract identifier from header whatever it is like
    # the rest is description
    my ($sId,$sDescr);
    if ($header =~ m/^>(\S+)(\s+|$)/) {
      $sId = $1;
      $sDescr = $';
    } else {
      $sId = $sDescr = '';
      printf STDERR "%s. WARNING: no sequence identifier in header:\n  $header\n", &MySub;
    }

    # enter sequence entry
    %SeqEntry = (
      $KeyId   => $sId,
      sequence => join ('', @SeqLn),
      header   => $header,
      descr    => $sDescr,
      SrcPath  => (-f $PathIn) ? &PathExpand($PathIn) : $PathIn,
      SrcFmt   => 'fastA',
      SrcMulti => $bSrcMulti || int(!$hIn->eof()),
      );
    $debug and printf STDERR "%s. entered sequence entry\n"
      ."  ID(\"%s\"): %s\n  seq string length (incl. newlines): %d\n", &MySub,
      $KeyId||"''", $sId||"''", &Sum(map{ length($_) }@SeqLn);

  ##############################################################################
  # selection and refinement

    # select by sequence identifier / description
    if ($SlcIdRE and $sId!~m/$SlcIdRE/) { redo }
    if (%MatchID and !$MatchID{$sId}) { redo }
    if ($SlcDescrRE and $SeqEntry{header}!~m/$SlcDescrRE/) { redo }

    # optionally purify sequence
    if ($opt{-pure}) {
      $SeqEntry{sequence} = &SeqStrPure ($SeqEntry{sequence}, -SeqType=>$opt{-pure});
    } else {
      $SeqEntry{sequence} =~ tr/ \t\n\r//d;
      if ($debug) {
        my @strange = grep { ord()<0x20 or ord()>=0x80 } split(//,$SeqEntry{sequence});
        if (int @strange) {
          printf STDERR "%s. WARNING: strange characters in sequence string\n", &MySub;
          printf STDERR "  sequence: %s\n", $sId;
          printf STDERR "  character: %d=0x%X\n", (ord($strange[0])) x 2;
        }
      }
    }

  } # end: loop until successful load

  # return sequence entry reference
  return \%SeqEntry;
}


# format sequence data structure to fastA format (Pearson format)
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -header     complete header string with or without line feed. This option
#               overwrites option '-phrase'.
#   -KeyId      [STD]
#   -KeySeq     [STD]
#   -phrase     phrase for construction of header (default: none). A
#               statement specifying the sequence length is added. This
#               option overwrites sequence data field 'header'.
#   -pure       handed over to &_SeqStrCluster -> &SeqStrPure
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#   -upper      handed over to &_SeqStrCluster -> &SeqStrPure
#
# - return val: plain fastA format
#
# DESCRIPTION
# - The field "header" will be used for re-output of fastA format, if present.
#   Note that any changes to either "id" or "descr" will be lost unless field
#   "header" is removed.
#
sub SeqentryToFasta {
  my ($pSeq,%opt) = @_;
  my $KeyId  = $opt{-KeyId} || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};

  # work out header
  my $header = $opt{-header} ||
    ($$pSeq{header} and !$opt{-phrase} and $KeyId eq $LibGlob{KeyId}) ?
      $$pSeq{header} : (
        (defined($$pSeq{descr}) and !$opt{-phrase}) ?
          sprintf ('>%s %s', $$pSeq{$KeyId}, $$pSeq{descr}||'') :
          sprintf ('>%s %s%s%d letters', $$pSeq{$KeyId}||'UnknownSeq',
            $opt{-phrase}||$$pSeq{descr}||'',
              ($opt{-phrase} or $$pSeq{descr})?', ':'',
            length($$pSeq{SeqPure}||=&SeqStrPure($$pSeq{$KeySeq}||'')),
            )
    );
  $header =~ s/(\r?\n)*$/\n/;

  # synthesis
  my $SeqPlain = &_SeqStrCluster ($$pSeq{$KeySeq},
    -chars=>60, -blocks=>1, %opt);

  return $header.$SeqPlain;
}


################################################################################
# I/O of pretty HTML
################################################################################


# format sequence data structure to pretty HTML
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -BlockLen   clustered sequence in line blocks, default: 10
#   -debug      [STD]
#   -KeyId      [STD]
#   -KeySeq     [STD]
#   -LineLen    sequence per in line, default: 60
#   -PosRef     set reference position for sequence position numbering,
#               default: 0 => counting starts with 1
#
# - return val: pretty HTML format string
#
sub SeqentryToPrettyHtml {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my $KeyId  = $opt{-KeyId} || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};
  my $LineLenDft = 50;
  my $LineLen  = int($opt{-LineLen}||0)  || $LineLenDft;
  my $BlockLenDft = 10;
  my $BlockLen = int($opt{-BlockLen}||0) || $BlockLenDft;
  $debug and printf STDERR "%s. block format parameters: block %d, line %d\n", &MySub,
    $BlockLen, $LineLen;

  # purify sequence string
  my $sSeqTgt = $$pSeq{sequence};
  $sSeqTgt =~ tr/ \t\n\r//d;
  my $PosMaxchar = &Max (5, map {length("$_")} map {
    my $pos = $_ - int($opt{-PosRef}||0);
    if ($pos>=0) { $pos ++ }
    $pos
    } 0, length($sSeqTgt) );
  $debug and printf STDERR "%s. max. size of numbering column: %d\n", &MySub, $PosMaxchar;

  # join overlapping annotations
  # they are redundant now, because all sub-specifying attributes will be
  # dropped: text, orientation, ...
  #for (my $i=0; $i<@{$$pSeq{annot}}; ++$i) {
  #}

  # header
  my $header;
  {
    my $sDescr = &LineBreak ($$pSeq{descr}||'-', -length=>60);
    $sDescr   =~ s/\n(.)/          $1/g;
    $header  = "<PRE>\n";
    $header .= sprintf "\%-${PosMaxchar}s  \%s\n", 'ID', $$pSeq{id};
    $header .= sprintf "\%-${PosMaxchar}s  \%s\n", 'DESCR', $sDescr;
      # $sDescr will contain a trailing "\n" (see &LineBreak)
  }

  # prepare HTML tags corresponding to annotation borders
  my (@AnnotPrim, @AnnotStack, @AnnotHtml);
  foreach my $pAnnot (@{$$pSeq{annot}}) {
    # positions are in computational notation, here
    if ($$pAnnot{type} =~ m/^#?([0-9A-Fa-f]{6})$/) {
      push @AnnotPrim, {
        flank=>-1, pos=>$$pAnnot{offset}-1,
        tag=>"<FONT COLOR=#$1>" };
      push @AnnotPrim, {
        flank=> 1, pos=>$$pAnnot{end},
        tag=>"</FONT>" };
    } elsif ($$pAnnot{type} =~ m/^\w+$/) {
      push @AnnotPrim, {
        flank=>-1, pos=>$$pAnnot{offset}-1,
        tag=>"<$$pAnnot{type}>" };
      push @AnnotPrim, {
        flank=> 1, pos=>$$pAnnot{end},
        tag=>"</$$pAnnot{type}>" };
    }
  }

  # prepare substrings for block formatting
  for (my $CtI=0; $CtI<length($sSeqTgt); $CtI+=$BlockLen) {
    if ($CtI % $LineLen) {
      push @AnnotPrim, { flank=>0, pos=>$CtI, tag=>" " };
    } else {
      if ($CtI) {
        push @AnnotPrim, { flank=>0, pos=>$CtI, tag=>"\n\n" };
      }
      my $pos = $CtI - int($opt{-PosRef}||0);
      if ($pos >= 0) { $pos ++ }
      push @AnnotPrim, { flank=>-0.5, pos=>$CtI,
        tag=>sprintf("\%${PosMaxchar}d  ", $pos) };
      push @AnnotPrim, { flank=> 0.5, pos=>$CtI, action=>'silent' };
      push @AnnotPrim, { flank=>-0.8, pos=>$CtI, action=>'continue' };
    }
  }

  # sort tags
  @AnnotPrim = sort {
      ($a->{pos} <=> $b->{pos}) or
    ($b->{flank} <=> $a->{flank}) or
    ($a->{flank} * ($a->{tag} cmp $b->{tag})) or
    $a <=> $b } @AnnotPrim;
  $debug and printf STDERR "%s. tags and substrings to enter: %d\n", &MySub, int(@AnnotPrim);

  # no formats on numbering column
  # - iterate through sorted tags
  foreach my $pAnnot (@AnnotPrim) {

    # perform stack action
    if (exists($$pAnnot{action})) {
      # undo tags in stack
      if ($$pAnnot{action} eq 'silent') {
        foreach (reverse @AnnotStack) {
          $_->{tag} =~ m/^<(\w+)/;
          push @AnnotHtml, { flank=>$$pAnnot{flank}, pos=>$$pAnnot{pos},
            tag=>"</$1>" };
        }
      }
      # reactivate tags in stack
      elsif ($$pAnnot{action} eq 'continue') {
        foreach (@AnnotStack) {
          push @AnnotHtml, { flank=>$$pAnnot{flank}, pos=>$$pAnnot{pos},
            tag=>$_->{tag} };
        }
      }
      next;
    }

    # update stack of active tags
    elsif ($$pAnnot{tag} =~ m/^<\w+/) {
      push @AnnotStack, $pAnnot;
    }
    elsif ($$pAnnot{tag} =~ m/^<\/\w+/) {
      pop @AnnotStack;
    }

    # format sequence string
    push @AnnotHtml, $pAnnot;
  }

  # format sequence with HTML tags
  # - by using reverse() rightmost annotation tag will be entered first
  # - remember: positions are in computational notation
  foreach my $pAnnot (reverse @AnnotHtml) {
    substr($sSeqTgt,$$pAnnot{pos},0) = $$pAnnot{tag};
  }

  $sSeqTgt .= "\n</PRE>\n";
  return $header . $sSeqTgt;
}


################################################################################
# I/O of Experiment file format (Staden package)
################################################################################


# read Staden Experiment file into sequence data structure
#
# INTERFACE
# - argument 1: path of source file.
#               - Don't expect the code to work with '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may, and should, be specified via switch -handle. Though, the
#                 path argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -ClipQuality
#               return only quality range of sequence string
#   -ClipUnk    clip unknown emissions at the end of the sequence string
#   -debug      [STD]
#   -fast       parse file quick and dirty. This option is recommended if only
#               the basic information (ID, sequence) is needed and the data
#               is not meant to be written back to a highly informative
#               file format.
#               This option doesn't exclude use of option -pure. But it won't
#               work in conjunction with option -ClipQuality.
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - unless this option is used, it's impossible to read multiple
#                 sequences from a multi-sequence file via this function.
#               - If the filehandle is not seekable (e.g. input from pipe) then
#                 it should be an OO FileHandle that supports multi-byte unget
#                 via method ungets(). This is fulfilled by CPAN's
#                 FileHandle::Unget.
#   -KeyId      [STD]
#   -MatchID    select by matching to sequence identifier. Selector may be:
#               ARRAY ref  list of sequence identifiers (not regexps!)
#                          Hash is preferred due to faster performance.
#               HASH ref   keyed list of sequence identifiers (not regexps!)
#   -pure       purify sequence strings for sequence-coding letters
#               - You may specify a sequence type. Then, fuzzy letters are
#                 converted to official 'unknowns'.
#               - Positional references are adjusted.
#               - Upper/lower-case appearance of the sequence string remains
#                 unchanged.
#               - Warning: fields AV, ON will be lost
#   -SlcID      select by regexp applied to sequence identifier
#
# - return val: - reference to sequence data structure
#               - undef if there's no additional sequence entry
#               - undef if an error occurred
#
# DESCRIPTION
# - The length entry (field label 'LE') is deleted because of its redundancy.
# - comments (label 'CC') are disregarded. They make sense only in the ordered
#   context of the file format. But the data structure is unordered!
# - special attention is addressed to the tag (annotation) fields. As far as
#   possible they're parsed into the array 'annot'. Special handling for
#   special tag fields:
#   CS   though a special field exists for this type of tag, it's also parsed
#        to the standard annotation array. GAP4 will also recognize it if
#        encoded as 'TG   CVEC ...'.
#   SL   not added to standard annotation array
#   SR   not added to standard annotation array
# - Field SrcMulti reflects the situation quite sensitive. However, erroneous
#   or filtered sequence entries that're preceding or following the returned
#   sequence entry will be counted as additional entries. So, there may be
#   some cases of SrcMulti==1 where only one entry was retrieved from the
#   sequence file (after selection and error skipping).
#
# DEBUG, CHANGES, ADDITIONS
# - fields AV, ON will be lost in conjunction with option -pure
#
sub SeqentryPopExper {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # filehandle from input argument
  if (!$opt{-handle} and $opt{-debug}) {
    printf STDERR "%s. WARNING: sequential reading does not make sense with a non-handle input argument\n", &MySub;
  }
  my $hIn = $opt{-handle} || FileHandle->new($PathIn);
  if (ref($hIn) eq 'GLOB') {
    $hIn = FileHandle->new_from_fd($hIn,'r') or die sprintf "%s. fdup ERROR", &MySub;
  }

  # prepare selectors from selector arguments
  my (%MatchID,$SlcIdRE);
  if      (ref($opt{-MatchID}) eq 'HASH') {
    %MatchID = %{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers:\n  %s\n", &MySub,
      &ListMaxfirst([keys %MatchID],5,-ElemExceed=>'...',-join=>' ');
  } elsif (ref($opt{-MatchID}) eq 'ARRAY') {
    %MatchID = map { ($_,1) } @{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers (built from list):\n  %s\n", &MySub,
      &ListMaxfirst([keys %MatchID],5,-ElemExceed=>'...',-join=>' ');
  }
  if ($opt{-SlcID}) {
    $SlcIdRE = $opt{-SlcID};
    $debug and printf STDERR "%s. using regexp selector for identifiers: %s\n", &MySub, $SlcIdRE;
  }

  ##############################################################################
  # parsing

  # multi-sequence input?
  my $bSrcMulti = int ($hIn->tell()>0);

  # loop until successful load of one entry
  my $pSeqFin;
  {
    # parse line by line
    my (%SeqPrim,@SeqPrimSeq);
    my ($line,$lineo);
    while (defined ($line=$lineo=<$hIn>)) {
      chomp $line;

      if ($line =~ m/^([A-Z]{2})( {3}|$)/) {
        my ($label,$content) = ($1,$');
        if (0) { }

        # parse tag
        elsif ($label =~ m/^T[CG]$/) {

          # grab annotation text of current tag entry
          my $AnnotPlain = '';
          while (defined($line=$lineo=<$hIn>) and $line=~m/^T[CG] {8}/) {
            $AnnotPlain .= $';
          }
          chomp $line;

          # parse tag into sequence annotation data structure
          my ($pAnnot);
          if ($content =~ m/^([A-Z0-9_]{4}) ([b+=-]) (\d+)\.\.(\d+)(?: "(.+)")? *$/
            and $3>0 and $4>0
          ) {
            $pAnnot = {
              type   => $1,
              orient => ($2 eq 'b' or $2 eq '=') ? 0 : int($2.'1'),
              offset => $3,
              end    => $4,
              text   => ($5||'') . $AnnotPlain,
              layer  => ($label eq 'TC') ? 'cons' : undef,
                     # contig tags and read tags are organized separate layers
              };
          } else {
            printf STDERR "%s. ERROR in parsing tag line, seq %s: %s\n", &MySub,
              $SeqPrim{ID}||"''", $content||"''";
            redo;  # turn to next line which is already read from file handle
          }

          # nice text string
          $$pAnnot{text} =~ s/\n*$//;

          # enter into array of annotations
          push @{$SeqPrim{annot}||=[]}, $pAnnot;

          # debug
          if ($debug) {
            printf STDERR "%s. seq %s, entered sequence annotation (%s):\n", &MySub,
              $SeqPrim{ID}||"''", $$pAnnot{layer}||'reading';
            printf STDERR "  label: %s\n", $$pAnnot{type};
            printf STDERR "  range: %d..%d\n", $$pAnnot{offset}, $$pAnnot{end};
            printf STDERR "  text: %s\n", $$pAnnot{text};
          }
          redo;  # turn to next line which is already read from file handle
        }

        # parse tag 'CS' into sequence annotation data structure
        elsif ($label eq 'CS') {
          $content =~ m/^(\d+)\.{2}(\d+)$/;
          push @{$SeqPrim{annot}||=[]},  {
            type   => 'CVEC',
            orient => 1,
            offset => $1,
            end    => $2,
            };
          $debug and printf STDERR "%s. entered sequence annotation for tag 'CS'\n", &MySub;
        }

        # enter multi-line fields as an untouched array
        elsif ($label =~ m/^(AV|ON)$/) {
          push @{$SeqPrim{$label}||=[]}, $content;
        }

        # strip description from CC tag
        elsif ($label eq 'CC') {
          if ($content =~ m/^description: (.+)$/) {
            my $sDescr = $1;
            $SeqPrim{descr} .= (($SeqPrim{descr} and $SeqPrim{descr}!~m/ +$/) ? ' ':'') . $sDescr;
          }
        }

        # skip these, that data do not make sense in sequence data structure
        elsif ($label =~ m/^(XX|LE|YY)$/) { next }

        # completed current entry?
        # - reposition filehandle read position onto next sequence entry
        elsif ($label eq 'ID' and $SeqPrim{ID}) {
          $debug and printf STDERR "%s. found offset of next entry %s, input line %d\n", &MySub,
            $content, $.;
          if ($hIn->can('ungets')) {
            $debug and printf STDERR "%s. unreading via FileHandle->ungets()\n", &MySub;
            $hIn->ungets($lineo);
          } else {
            $debug and printf STDERR "%s. unreading via seek(), rel %d\n", &MySub,
              -length($lineo);
            seek $hIn, -length($lineo), 1;
          }
          last;
        }

        # parse sequence string
        elsif ($label eq 'SQ') {
          while (defined($line=$lineo=<$hIn>) and $line!~m|^//|) {
            push @SeqPrimSeq, $line;
          }
        }

        # enter field untouched
        else {
          $SeqPrim{$label} = $content;
          $debug and printf STDERR "%s. entered field %s, input line %d\n", &MySub, $label, $.;
        }
      }

      # error handling
      elsif ($line =~ m/^$/) { next }  # skip empty line
      else {                           # line not matching to 'label   text'
        $debug and printf STDERR "%s. ERROR: non-parsable line in Experiment File: $line\n", &MySub;
      }

    } # end while - line loop

    # any entry parsed?
    unless (exists $SeqPrim{ID}) { return undef }

  ##############################################################################
  # refinement and selection

    # do basic conversions
    $pSeqFin = {
      $KeyId   => $SeqPrim{ID},
      sequence => join('',@SeqPrimSeq),
      SrcPath  => (-f $PathIn) ? &PathExpand($PathIn) : $PathIn,
      SrcFmt   => 'Experiment',
      SrcMulti => $bSrcMulti || int(!$hIn->eof()),
      };
    delete $SeqPrim{ID};
    unless ($opt{-fast}) { $pSeqFin = { %$pSeqFin,%SeqPrim } }
    $$pSeqFin{sequence} =~ tr/-* \n/N-/d;
    if ($debug) {
      printf STDERR "%s. result of parsing sequence %s\n", &MySub, $$pSeqFin{$KeyId}||"''";
      print  STDERR &_SeqStrCluster ($$pSeqFin{sequence}, -chars=>60, -blocks=>1);
    }

    # select by sequence identifier / description
    if ($SlcIdRE) {
      if ($$pSeqFin{$KeyId} !~ m/$SlcIdRE/) { redo }
    }
    if (%MatchID) {
      unless ($MatchID{$$pSeqFin{$KeyId}}) { redo }
    }

  } # end loop until successful load

  ##############################################################################
  # refinement

  # refine quick and dirty
  if ($opt{-fast}) {
    $debug and printf STDERR "%s. refinement fast, purification %s\n", &MySub,
      $opt{-pure} ? 'YES':'NO';
    $opt{-ClipQuality} and
      $pSeqFin = &_SeqExperClipQual ($pSeqFin, -debug=>$debug);
    $opt{-pure} and
      $$pSeqFin{sequence} = &SeqStrPure ($$pSeqFin{sequence}, -SeqType=>$opt{-pure});
    $opt{-ClipUnk} and
      $pSeqFin = &SeqClipUnk ($pSeqFin, -debug=>$dbg2);
  }

  # do complex conversions
  else {
    $debug and printf STDERR "%s. refinement standard, purification %s\n", &MySub,
      $opt{-pure}||'NO';
    &_SeqExperOptimize ($pSeqFin, -KeyId=>$opt{-KeyId}, -debug=>$debug);
    #if ($$pSeqFin{$KeyId} =~ m/$reSeqid{GscjRd}/o) {
    #  &_SeqExperDicty ($pSeqFin, -KeyId=>$opt{-KeyId}, -debug=>$debug);
    #}
    $opt{-ClipQuality} and
      $pSeqFin = &_SeqExperClipQual ($pSeqFin, -debug=>$debug);
    $opt{-pure} and
      $pSeqFin = &SeqPure ($pSeqFin, -SeqType=>$opt{-pure}, -debug=>$dbg2);
    $opt{-ClipUnk} and
      $pSeqFin = &SeqClipUnk ($pSeqFin, -debug=>$dbg2);
  }

  # return sequence entry reference
  $debug and printf STDERR "%s. returning sequence entry\n", &MySub;
  return $pSeqFin;
}


# optimize Experiment file data structure
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is explicitly changed
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]
#
sub _SeqExperOptimize {
  my ($pSeq,%opt) = @_;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # identity
  $$pSeq{EN} || ($$pSeq{EN} = $$pSeq{$KeyId} || '-');

  # cloning / sequencing vector information
  foreach ('C', 'S') {
    if ($$pSeq{$_.'F'}) {

      # no information for CF/SF? => delete it!
      if ($$pSeq{$_.'F'} =~ /^(-|unknown)$/) {
        delete $$pSeq{$_.'F'};
      }

      # carry over info from CF/SF to CV/SV
      elsif ($$pSeq{$_.'V'} =~ /^(-|unknown)$/) {
        $$pSeq{$_.'V'} = $$pSeq{$_.'F'};
        $$pSeq{$_.'V'} =~ s/\..*$//;
      }
    }

    # no information for CV/SV? => delete it!
    if (!$$pSeq{$_.'F'} and $$pSeq{$_.'V'} and $$pSeq{$_.'V'} =~ /^(-|unknown)$/) {
      delete $$pSeq{$_.'V'};
    }
  }

  # sort annotations
  @{$$pSeq{annot}} = &AnnotSort (@{$$pSeq{annot}});

  # sequencing vector positions
  if (exists($$pSeq{SL}) and $$pSeq{SL}<=1) {
    delete $$pSeq{SL};
  }
  if (exists($$pSeq{SR}) and $$pSeq{SR}<=0) {
    delete $$pSeq{SR};
  }
  if (exists($$pSeq{SR}) and exists($$pSeq{SL}) and $$pSeq{SR}<=$$pSeq{SL}) {
    delete $$pSeq{SR};
  }
  if (exists($$pSeq{SR}) and $$pSeq{SR}>length($$pSeq{sequence})) {
    delete $$pSeq{SR};
  }
}


# optimize Experiment file data structure for a Dictyostelium Gap4 project
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is explicitly changed
#
sub _SeqExperDicty {
  my ($pSeq,%opt) = @_;

  # clone name
  if (exists($$pSeq{CN}) and $$pSeq{CN}=~m/^(AX4|Chr2)[a-z]$/) {
    $$pSeq{CN} = $1;
  }
  if (exists($$pSeq{CF}) and $$pSeq{CF}=~m/^(PAC)/) {
    delete $$pSeq{CF};
  }

  # sequencing vector and quality range
  if ($$pSeq{SV}) {
    $$pSeq{SV} =~ s/(puc18|unknown)/pUC18/;
  } else {
    $$pSeq{SV} = 'pUC18';
  }
  if ($$pSeq{SL} and $$pSeq{QL} and $$pSeq{QL}<$$pSeq{SL}) {
    $$pSeq{QL} = $$pSeq{SL};
  }

  # sequence string
  $$pSeq{sequence} =~ tr/BDHKMRSVWY/NNNNNNNNNN/;
  $$pSeq{sequence} =~ tr/L/N/;   # change some curious mis-edits
}


# expand sequence attributes in Experiment file data structure
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is explicitly changed
#
# - options:
#   -debug      [STD]
#
sub SeqExperExpandSeqattrib {
  my ($pSeq,%opt) = @_;

  # AV := quality attributes
  my $AV = join('', @{$pSeq->{AV}||[]});
  $pSeq->{AV_expand} = [ split /\s+/,$AV ];
  if (int(@{$pSeq->{AV_expand}}) != length($pSeq->{sequence})) {
    printf STDERR "WARNING: expanded AV does not match sequence length, seq %s\n",
      $pSeq->{id};
  }

  # ON := original position numbering
  my $ON = join('', @{$pSeq->{ON}||[]});
  $ON =~ s/\b1\.\.0\b/1 0/;
    # fix BUG of Experiment output (GAP4 export directed assembly)
  $ON =~ s/(\d+)\.\.(\d+)/join(' ',$1..$2)/eg;
  $pSeq->{ON_expand} = [ split /\s+/,$ON ];
  if (int(@{$pSeq->{ON_expand}}) != length($pSeq->{sequence})) {
    printf STDERR "WARNING: expanded ON does not match sequence length, seq %s\n",
      $pSeq->{id};
  }
}


# cut out quality range of sequence string
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data will be left unchanged
#
# - options:
#   -debug      [STD]
#
# - return val: reference to sequence data structure
#
# DESCRIPTION
# - all position values are counted in biological system.
#
sub _SeqExperClipQual {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};

  ##############################################################################
  # determine effective quality range

  # determine clip positions
  # - positions refer to the most inward range position
  #   (ranges always extend to the sequence end)
  my $iSeqLen = length $$pSeq{sequence};
  my $QualLeft  = &Max ($$pSeq{QL}, $$pSeq{SL}, 0);
  my $QualRight = &Min ($$pSeq{QR} || $iSeqLen+1, $$pSeq{SR} || $iSeqLen+1);
  $debug and printf STDERR "%s. clip positions, 1st pass: %d left, %d right, seq length %d\n", &MySub,
    $QualLeft, $QualRight, $iSeqLen;

  # positions for invalid SL/SR positions
  if ($QualLeft >= $QualRight-1) {
    $QualLeft  = &Max ($$pSeq{QL}, 0);
    $QualRight = &Max ($QualLeft+1, $$pSeq{QR} || $iSeqLen+1);
    if ($$pSeq{SL} > $QualLeft and $$pSeq{SL} < $QualRight) {
      $QualLeft = $$pSeq{SL};
    }
    if ($$pSeq{SR} < $QualRight and $$pSeq{SR} > $QualLeft) {
      $QualRight = $$pSeq{SR};
    }
    $debug and printf STDERR "%s. clip positions, 2nd pass: %d left, %d right, seq length %d\n", &MySub,
      $QualLeft, $QualRight, $iSeqLen;
  }

  # final check
  if (!$QualLeft and $QualRight>$iSeqLen) {
    $debug and printf STDERR "%s. no effective clip positions in seq %s, nothing to do!\n", &MySub, $$pSeq{id}||"''";
    return $pSeq;
  }

  ##############################################################################
  # apply range to sequence entry

  # select true effective quality range
  if ($QualLeft < $QualRight-1) {

    # do true clip
    $pSeq = &SeqRange ($pSeq, $QualLeft+1, $QualRight-1, -debug=>$debug);
    $$pSeq{descr} =~ s/r(ange \d+\.\.\d+)$/QualityR$1/;
  }

  # NULL quality range
  else {
    $debug and printf STDERR "%s. WARNING: NULL quality range, seq %s\n", &MySub, $$pSeq{id}||"''";
    $pSeq = &DataClone ($pSeq);
    foreach (qw(sequence QL QR SL SR annot)) {
      delete $$pSeq{$_};
    }
    $$pSeq{descr} .= ($$pSeq{descr}?', ':'') . 'QualityRange NULL';
  }

  # return sequence entry reference
  return $pSeq;
}


# format sequence data structure ID entries to prepare it for output
# in Experiment file format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               data is explicitly changed
#
# - options:
#   -KeyId      [STD]
#
# - return val: reference to sequence data structure (same as argument 1)
#
sub SeqExperID {
  my ($pSeq,%opt) = @_;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # shorten long GenBank identifier line
  if ($$pSeq{$KeyId} =~ m/\|/ and $' =~ m/^[\w.-]+/) {
    $$pSeq{$KeyId} = $&;
  }

  # apply restrictions in identifier syntax
  foreach ($KeyId, 'EN', 'TN') {
    defined $$pSeq{$_} and $$pSeq{$_} =~ s/[^a-zA-Z0-9_.-]+//g;
  }

  # return sequence entry reference
  return $pSeq;
}


my @_SeqExperAnnotOrder = qw( ID EN TN DR QL QR annot TC TG ST SI SV CN CV CH PR ON AV AP sequence SQ );
our %_SeqExperAnnotOrderdic;
for (my $i=0; $i<int(@_SeqExperAnnotOrder); ++$i) {
  $_SeqExperAnnotOrderdic{$_SeqExperAnnotOrder[$i]} = $i;
}

# format sequence data structure to Experiment file format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is left unchanged
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]
#   -KeySeq     [STD]
#   -pure       purify sequence strings for sequence-coding letters.
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#               Positional references are adjusted.
#               Upper/lower-case appearance of the sequence string remains
#               unchanged.
#
# - return val: plain Experiment file format
#
sub SeqentryToExper {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2 = $debug ? $debug-1 : undef;
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};
  my $KeySeq = $opt{-KeySeq} || $LibGlob{KeySeq};
  $pSeq = &DataClone($pSeq);
  $debug and printf STDERR "%s. working on seq %s\n", &MySub, $$pSeq{$KeyId}||"''";

  # apply restrictions in identifier syntax
  &SeqExperID ($pSeq,%opt);

  # purify sequence string
  if ($opt{-pure}) {
    $pSeq = &SeqPure ($pSeq, -SeqType=>$opt{-pure}, -debug=>$dbg2);
  }

  # print this first: ID, EN, TN, ...
  my $ExperPlain .= sprintf ("ID   %s\n", $$pSeq{$KeyId} || '-');
  foreach (qw(EN TN)) {
    if ($$pSeq{$_}) {
      $ExperPlain .= sprintf ("$_   %s\n", $$pSeq{$_});
      delete $$pSeq{$_};
    }
  }
  my $sId = $$pSeq{$KeyId};
  delete $$pSeq{$KeyId};

  # prepare annotations for output in Experiment format
  # here, we only need the complex range split feature of &AnnotExpandTscr
  if ($$pSeq{annot} and $$pSeq{SrcFmt} eq 'GenBank') {
    &AnnotExpandTscr ($pSeq);
  }

  # resolve nonsense quality clip
  if (exists($$pSeq{QL}) and exists($$pSeq{QR}) and $$pSeq{QR}-$$pSeq{QL}<=1) {
    $$pSeq{QL} = &Min (length($$pSeq{$KeySeq})-2, 40);
    $$pSeq{QR} = &Min (length($$pSeq{$KeySeq}), 42);
  }

  # resolve expanded ON data
  if (exists($$pSeq{ON_expand})) {
    $$pSeq{ON}=[];
    my $ON='';
    my $i;
    for ($i=0; $i<int(@{$$pSeq{ON_expand}}); ++$i) {
      my $n=$$pSeq{ON_expand}[$i]+1;
      my $j=$i+1;
      for ( ; $j<int(@{$$pSeq{ON_expand}}) and $$pSeq{ON_expand}[$j]==$n; ++$j,++$n) { }
      --$j; --$n;
      if ($j>$i) {
        $$pSeq{ON_expand}[$i] .= '..' . $n;
        splice @{$$pSeq{ON_expand}}, $i+1, $j-$i;
      }
    }
    for ($i=0; $i<int(@{$$pSeq{ON_expand}}) and length($ON)<=60; ++$i) {
      $ON .= $$pSeq{ON_expand}[$i] . ' ';
    }
    for ( ; $i<int(@{$$pSeq{ON_expand}}); ++$i) {
      if (length($ON) > 59) {
        push @{$$pSeq{ON}}, $ON;
        $ON=' 'x 5;
      }
      $ON .= $$pSeq{ON_expand}[$i] . ' ';
    }
    push @{$$pSeq{ON}}, $ON;
  }

  # resolve expanded AV data
  if (exists($$pSeq{AV_expand})) {
    $$pSeq{AV}=[];
    my $AV='';
    my $i=0;
    for ( ; length($AV)<=60 and $i<int(@{$$pSeq{AV_expand}}); ++$i) {
      $AV .= $$pSeq{AV_expand}[$i] . ' ';
    }
    for ( ; $i<int(@{$$pSeq{AV_expand}}); ++$i) {
      if (length($AV)>60) {
        push @{$$pSeq{AV}}, $AV;
        $AV=' 'x 5;
      }
      $AV .= $$pSeq{AV_expand}[$i] . ' ';
    }
    push @{$$pSeq{AV}}, $AV;
  }

  # loop over remaining fields
  # - annotations last
  # - rest: sort alphabetically
  my $iAnnotOrderDft=int(@_SeqExperAnnotOrder);
  foreach my $field (sort { ($_SeqExperAnnotOrderdic{$a}||$iAnnotOrderDft)<=>($_SeqExperAnnotOrderdic{$b}||$iAnnotOrderDft) or lc($a) cmp lc($b) } keys %$pSeq) {

    # print sequence string
    if ($field eq $KeySeq and $$pSeq{$field}) {
      my $sSeq = $$pSeq{$field};
      $sSeq =~ tr/N-/-*/;
      $sSeq = &_SeqStrCluster ($sSeq, -chars=>10, -blocks=>6, -indent=>5);
      $ExperPlain .= "SQ   \n$sSeq//\n";
    }

    # print description, encoded as a CC tag
    elsif ($field eq 'descr') {
      $debug and printf STDERR "%s. seq %s, description entry\n", &MySub, $sId;
      $ExperPlain .= sprintf ("CC   description: %s\n", $$pSeq{$field});
    }

    # format annotations
    elsif ($field eq 'annot') {
      $debug and printf STDERR "%s. seq %s, %d annotation%s\n", &MySub, $sId,
        int(@{$$pSeq{annot}}), (@{$$pSeq{annot}}==1) ? '':'s';
      foreach my $pAnnot (&AnnotSort (@{$$pSeq{annot}})) {

        # quality range
        if ($$pAnnot{layer} and $$pAnnot{layer} eq 'qual') { }

        # vector ranges
        elsif ($$pAnnot{layer} and $$pAnnot{layer} eq 'VectLeft') { }
        elsif ($$pAnnot{layer} and $$pAnnot{layer} eq 'VectRight') { }

        # annotations output in tag (TC or TG) format
        else {
          my $AnnotLabel = ($$pAnnot{layer} and $$pAnnot{layer} eq 'cons') ? 'TC' : 'TG';
          $debug and printf STDERR "%s. annotation: label %s, text %d chars\n", &MySub,
            $$pAnnot{type}, length($$pAnnot{text});
          $ExperPlain .= sprintf ("%s   %s %s %d..%d\n",
            $AnnotLabel, $$pAnnot{type},
            &SignChar ($$pAnnot{orient}, -allow0=>'b'),
            $$pAnnot{offset}, $$pAnnot{end});
          $$pAnnot{text} and $ExperPlain .= join ('',
            map { sprintf ("%s%s%s\n", $AnnotLabel, ' 'x8, $_) }
            split (/\n/, $$pAnnot{text}));
        }
      }
    }

    # print any untouched field
    elsif ($field =~ m/^[A-Z]{2}$/) {
      if (ref($$pSeq{$field}) eq 'ARRAY') {
        $ExperPlain .= "$field   ". join("\n$field   ",@{$$pSeq{$field}}) ."\n";
      } else {
        $ExperPlain .= "$field   $$pSeq{$field}\n";
      }
    }

  } # foreach $field

  return $ExperPlain;
}


################################################################################
# I/O of GenBank format
################################################################################


# read next sequence data structure entry from GenBank file
#
# INTERFACE
# - argument 1: path of source file.
#               - Don't expect the code to interpret '-' as an alias for \*STDIN
#               - In addition to the specification here, a source filehandle
#                 may, and should, be specified via switch -handle. Though, the
#                 path argument enables the function to enter a value for the
#                 according data field.
#
# - options:
#   -debug      [STD]
#   -fast       parse file quick and dirty: don't parse sequence source /
#               annotation / promoter-relevant information.
#   -handle     reference type GLOB, FileHandle
#               - cf. description for arg1
#               - unless this option is used, it's impossible to read multiple
#                 sequences from a multi-sequence file via this function.
#   -KeyId      [STD]
#   -MatchID    select by matching to sequence identifier. Selector may be:
#               ARRAY ref  list of sequence identifiers (not regexps!)
#                          Hash is preferred due to faster performance.
#               HASH ref   keyed list of sequence identifiers (not regexps!)
#   -pure       note: GenBank sequences are always pure
#   -SlcID      select by regexp applied to sequence identifier
#   -SlcDescr   select by regexp applied to sequence description field
#   -SlcType    select by sequence type, option -fast will be
#               switched off automatically.
#
# - return val: - reference to sequence entry
#               - undef if there's no additional sequence entry
#               - undef if an error occurred
#
# DEVELOPER'S NOTES
# - Data field SrcMulti reflects the situation quite sensitive. However,
#   erroneous or filtered sequence entries that're preceding or following the
#   returned sequence entry will be counted as additional entries. So, there may
#   be some cases of SrcMulti==1 where only one entry was retrieved from the
#   sequence file (after selection and error skipping).
#
sub SeqentryPopGenbank {
  my ($PathIn,%opt) = @_;
  my $debug = $opt{-debug};
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # filehandle from input argument
  if (!$opt{-handle} and $opt{-debug}) {
    printf STDERR "%s. WARNING: sequential reading does not make sense with a non-handle input argument\n", &MySub;
  }
  my $hIn = $opt{-handle} || FileHandle->new($PathIn);
  if (ref($hIn) eq 'GLOB') {
    $hIn = FileHandle->new_from_fd($hIn,'r') or die sprintf "%s. fdup ERROR", &MySub;
  }

  # prepare selectors from selector arguments
  my (%MatchID, $SlcIdRE, $SlcDescrRE);
  if      (ref($opt{-MatchID}) eq 'HASH') {
    %MatchID = %{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers:\n  %s\n", &MySub,
      join (' ', keys %MatchID);
  } elsif (ref($opt{-MatchID}) eq 'ARRAY') {
    %MatchID = map { ($_,1) } @{$opt{-MatchID}};
    $debug and printf STDERR "%s. using hash selector for identifiers (built from list):\n  %s\n", &MySub,
      join (' ', keys %MatchID);
  }
  if ($opt{-SlcID}) {
    $SlcIdRE = $opt{-SlcID};
    $debug and printf STDERR "%s. using regexp selector for identifiers: %s\n", &MySub, $SlcIdRE;
  }
  $SlcDescrRE = $opt{-SlcDescr};
  $opt{-SlcType} and delete $opt{-fast};

  ##############################################################################
  # parsing

  # multi-sequence input?
  my $bSrcMulti = int ($hIn->tell()>0);

  # loop until successful load of one entry
  my $pSeq;
  {
    my $PlnEntry = do {
      my @buffer;
      {
        unless (defined ($_=$hIn->getline())) {
          $debug and printf STDERR "%s. ERROR: premature end of file, buffer:\n%s", &MySub, @buffer;
          return undef;
        }
        if (! m|^//$|) {
          push @buffer, $_;
          redo;
        }
      }
      join ('', @buffer);
    };

    # read basic sequence information
    $pSeq = &_Genbank2Seq ($PlnEntry, -KeyId=>$KeyId, -debug=>$debug);
    if (!$$pSeq{sequence} or !$$pSeq{$KeyId}) {
      unless (int(%$pSeq)) {
        printf STDERR "%s. ERROR: missing primary sequence info (id, seq)\n", &MySub;
      } elsif (!$$pSeq{$KeyId}) {
        printf STDERR "%s. ERROR: missing sequence ID (idnum %d, seq: %s)\n", &MySub, $$pSeq{idnum}||"''", $$pSeq{sequence}||"''";
      } elsif (!$$pSeq{sequence}) {
        printf STDERR "%s. WARNING: sequence entry without sequence information (ID: %s)\n", &MySub, $$pSeq{sequence}||"''";
      }
    }
    $debug and printf STDERR "%s. entered sequence entry with ID %s\n", &MySub, $$pSeq{$KeyId}||"''";

    # select by sequence identifier / description
    if ($SlcIdRE) {
      if ($$pSeq{$KeyId} !~ m/$SlcIdRE/) { redo }
    } elsif (%MatchID) {
      unless ($MatchID{$$pSeq{$KeyId}}) { redo }
    }
    if ($SlcDescrRE and $$pSeq{header}!~m/$SlcDescrRE/) { redo }

    # read advanced sequence information
    unless ($opt{-fast}) {
      %$pSeq = (%$pSeq, map{ %{$_||{}} } &_Genbank2Source($PlnEntry,-debug=>$debug));
      $debug and printf STDERR "%s. sequence type is %s\n", &MySub, $$pSeq{SeqType}||"''";
      $$pSeq{annot} = &_Genbank2Annot ($PlnEntry,%opt);
    }

    # select by sequence type
    if ($opt{-SlcType} and $$pSeq{SeqType} ne $opt{-SlcType}) {
      $debug and printf STDERR "%s. entry skipped due to sequence type\n", &MySub;
      redo;
    }

    # enter additional fields, modify fields
    if (length ($PathIn)) {
    $$pSeq{SrcPath}  = (-f $PathIn) ? &PathExpand($PathIn) : $PathIn;
    }
    $$pSeq{SrcFmt}   = 'GenBank';
    $$pSeq{SrcMulti} = $bSrcMulti || int(!$hIn->eof());

    # purification of sequence string is done in &_Genbank2Seq
  }

  # return sequence entry reference
  return $pSeq;
}


# extract data from GenBank format - basic sequence data
#
# INTERFACE
# - argument 1: GenBank plain text
#
# - options:
#   -debug      [STD]
#   -KeyId      [STD]
#
# - return val: - reference to sequence data structure, data entries:
#                 id           GenBank Accession Number if given
#                 idnum        gid number (remember GenBank ID format 'gi|\d+)
#                 descr        seq description
#                 sequence     seq string
#               - undef if an error occurs
#
sub _Genbank2Seq {
  my ($GbPlain,%opt) = @_;
  my $debug = $opt{-debug};
  my $KeyId = $opt{-KeyId} || $LibGlob{KeyId};

  # get fields: description, id, sequence
  my (%GbSeq);
  if ($GbPlain !~ m#\nDEFINITION +(.+?)\n\b(.+?\n)?ACCESSION {3}(\w+).*?\n(VERSION {5}\w+(\.\d+)? {2}GI:(\d+)\n)?.+?\nORIGIN.*?\n(.+?)\n(//|$)#os) {
    $debug and printf STDERR "%s. ERROR: RegExp failed to match\n", &MySub;
    return undef;
  }
  $GbSeq{$KeyId} = $3;
  $GbSeq{idnum} = $5;
  $GbSeq{descr} = $1;
  $GbSeq{sequence} = $7;

  # format description string
  $GbSeq{descr} =~ tr/\t\n\r/ /;
  $GbSeq{descr} =~ tr/ / /s;
  $GbSeq{descr} =~ s/(^\s*|\s*$)//g;

  # format sequence string
  $GbSeq{sequence} = &SeqStrPure ($GbSeq{sequence});

  return \%GbSeq;
}


# extract data from GenBank format - source information
#
# INTERFACE
# - argument 1: GenBank plain text
#
# - options:
#   -debug      [STD]
#
# - return val: - reference to sequence entry additions, data fields:
#                 date         date of submission (latest version of entry)
#                 date1st      date of first submission (creation of entry)
#                              *** not implemented ***
#                 medline      reference to array of Medline publications
#                              (ID numbers)
#                 organelle    from source information section at beginning
#                              of feature table
#                 organism     from source information section at beginning
#                              of feature table
#                 OrganismNum  from source information section at beginning
#                              of feature table
#                 publ         text of first informative publication
#                 SeqCateg     GenBank sequence category
#                 SeqType      as stated in GenBank header
#                 SeqMolType   from source information section at beginning
#                              of feature table
#                 submit       info "direct submission" (creation of entry)
#                 taxa         ordered list of taxa holding current organism
#               - undef if an error occurs
#
sub _Genbank2Source {
  my ($GbPlain,%opt) = @_;
  my $debug = $opt{-debug};

  # function parameters
  my $TmpID;
  if ($GbPlain=~m/\nACCESSION {3}(\w+)/) { $TmpID=$1 }

  # get fields: sequence type, category, date
  my %GbSrc;
  $GbPlain =~ m|^LOCUS {7}.{24}(\w+)|s;
  $GbSrc{SeqType} = $1;
  $GbPlain =~ m|^LOCUS {7}.{50} *(\w+) +([\w-]+)\n|s;
  $GbSrc{SeqCateg} = $1;
  $GbSrc{date} = $2;

  # get fields: organism, taxa - possible pitfalls:
  # - HTML-linked organism name
  # - phrase before the linked organism name
  if ($GbPlain =~ m/\n {2}ORGANISM {2}(.+?)\n\s+(.+?)\n\b/s) {
    $GbSrc{taxa} = $2;
    $GbSrc{taxa} =~ tr/\n\r\t/ /;
    $GbSrc{taxa} =~ tr/ / /s;
  } else {
    printf STDERR "%s. WARNING: regexp failed to grab organism, ID %s\n", &MySub, $TmpID;
  }

  # get field Medline
  while ($GbPlain =~ m|\bREFERENCE.+?\n +MEDLINE +(<[aA] .*?>)?(\d+)(</[aA]>)?.*?\n\b|gs) {
    push @{$GbSrc{medline}}, $2;
  }

  # get first informative publication
  # - true journal article if possible (last one wins)
  # - skip "unpublished" (journal field)
  # get submission info ("direct submission")
  # - pick the earliest one
  my %PublData;
  pos($GbPlain) = 0;
  while ($GbPlain =~ m/\bREFERENCE *.*?\n *AUTHORS *(.*?)\n *TITLE *(.*?)\n *JOURNAL *(.*?)\n\w/gs) {
    pos($GbPlain) -= 2;
    %PublData = (authors=>$1, title=>$2, journal=>$3);
    while ($PublData{journal} =~ m/\n\s+(MEDLINE|PUBMED)\s+(<[aA] .*?>)?(\d+)(<\/[aA]>)?/) {
      $PublData{lc $1} = $3;
      $PublData{journal} = $` . $';
    }
    foreach my $k (qw(authors title journal)) {
      $PublData{$k} =~ tr/\n\r\t/ /;
      $PublData{$k} =~ tr/ / /s;
      $PublData{$k} =~ s/(^\s*|\s*$)//g;
    }
    # decide for: submission info (store separate), upublished article (skip),
    # published article (store)
    if ($PublData{title} =~ m/direct submission/i) {
      if ($PublData{journal} =~ m/submitted \((.*?)\)\s*/i) {
        $PublData{date} = $1;
        $PublData{journal} = $` . $';
      }
      push @{$GbSrc{submitall}}, { %PublData };
    }
    elsif ( ($PublData{journal} =~ m/unpublished/io) ) { next }
    else {
      $GbSrc{publ} = { %PublData };
    }
  }
  if (! %PublData) {
    printf STDERR "%s. WARNING: regexp failed to grab references, entry %s\n", &MySub,
      $TmpID;
  }
  if ($GbSrc{submitall}) {
    $GbSrc{submit} = (sort { $a->{date} cmp $b->{date} } @{$GbSrc{submitall}})[0];
    delete $GbSrc{submitall};
  }

  # get SourceInfo from beginning of feature table
  # - matches may contain line breaks (esp. organism)
  if ($GbPlain =~ m/\bFEATURES\s+Location\/Qualifiers\n +source +.+?((\n {6,}(.+))+)/) {
    my $SrcPlain = $1;
    $SrcPlain =~ m/\/organism="([^"]+)"/ and $GbSrc{organism} = $1;
    $GbSrc{organism} =~ s/\s*\n\s+/ /;
    $SrcPlain =~ m/\/organelle="([^"]+)"/ and $GbSrc{organelle} = $1;
    $SrcPlain =~ m/\/db_xref="taxon:(\d+)"/ and $GbSrc{OrganismNum} = $1;
    $SrcPlain =~ m/\/mol_type="([^"]+)"/ and $GbSrc{SeqMolType} = $1;
  } else {
    printf STDERR "%s. WARNING: regexp failed to grab SourceInfo annotation, entry %s\n", &MySub, $TmpID;
    $GbPlain =~ m/\bFEATURES/s;
    printf STDERR "%s%s\n", $&, substr ($', 0, &Min(length($'),250));
  }

  return \%GbSrc;
}


# extract data from NCBI GenBank format - annotation information
#
# INTERFACE
# - argument 1: GenBank plain text
#
# - options:
#   -attrib     add data substructure (array) of attributes
#   -debug      [STD]
#
# - return val: - reference to sequence entry annotations (array @GbAnnot)
#                 Each feature represent hash containing:
#                 type         name of feature
#                 range        feature's complex range string
#                 text         additional feature description
#               - undef if an error occurs
#
sub _Genbank2Annot {

  # function constants
  my $MaxTextLn = 240;
  my $MaxText = 1024;

  # function parameters
  my ($GbPlain,%opt) = @_;
  my $bAttrib = int($opt{-attrib});
  my $debug = $opt{-debug};
  $debug and printf STDERR "%s. entered sub, parsing attributes: %s\n", &MySub,
    $bAttrib?'YES':'NO';

  # get plain text containing all features
  # - the following should be robust also for large annotation blocks, e.g.
  #   the annotated E.coli K-12 genome
  if ($GbPlain !~ m/\nFEATURES +\w+/) {
    printf STDERR "%s. ERROR: RegExp failed to match (annot offset)\n", &MySub;
    return undef;
  }
  $GbPlain = $';
  if ($GbPlain !~ m/\n\w+/) {
    printf STDERR "%s. ERROR: RegExp failed to match (annot end)\n", &MySub;
    return undef;
  }
  $GbPlain = $` . $&;

  # loop over feature entries
  my @GbAnnot;
  while ($GbPlain =~ m/\n {5}(\w+) +(.+?)\n(( {16,}.+\n)*)/g) {
    pos($GbPlain) -= 3;

    # enter annotation entry
    push @GbAnnot, {
      type  => $1,
      range => $2,
      };
    my $AnnotPlain = $3;

    # find all lines containing position string
    while ($GbAnnot[-1]{range} =~ m/,$/) {
      $AnnotPlain =~ m| {16,}(.+?)\n|gs or last;
      pos ($AnnotPlain) -= 3;
      $GbAnnot[-1]{range} .= $1;
    }
    $GbAnnot[-1]{range} =~ s/[ \t\n\r]+//g;

    # enter (short) annotation attributes to annotation text
    # eventually, enter annotation attributes as a data substructure
    my @attrib;
    while ($AnnotPlain =~ m| {16,}/(\w+)\n|g) {
      my $AnnotText = $1;
      $GbAnnot[-1]{text} .= "$AnnotText=TRUE\n";
      if ($bAttrib) { push @attrib, { label=>$AnnotText, val=>'TRUE' }; }
    }
    while ($AnnotPlain =~ m| {16,}/((\w+)="([^"]+)")|g) {  #"
      my $AnnotText = $1;
      if ($bAttrib) {
        my $AnnotLbl = $2;
        $AnnotText = $3;
        $AnnotText =~ s/\s*\n +/ /g;
        push @attrib, { label=>$AnnotLbl, val=>$AnnotText };
      }
      if ($AnnotText =~ m/^(function|transl_table|translation)/) { next }
      $AnnotText =~ s/\s*\n\s+/ /g;

      # attribute values (text fields) may be too long - skip them
      if (
        defined($AnnotText) and length($AnnotText)<=$MaxTextLn and
        length(exists($GbAnnot[-1]{text})?$GbAnnot[-1]{text}:'')+length($AnnotText) < $MaxText
      ) {
        $GbAnnot[-1]{text} .= $AnnotText ."\n";
      }
    }
    if ($bAttrib) { $GbAnnot[-1]{attrib} = [ @attrib ]; }
#    # additionally, enter bulky annotation attributes
#    if ($bAttrib) {
#      while ($AnnotPlain =~ m| {16,}/((\w+)="([^"]+(?:\n +[^"]+)*)")|g) {  #"
#        my $AnnotLbl = $2;
#        my $AnnotText = $3;
#        $bAttrib and push @attrib, { label=>$AnnotLbl, val=>$AnnotText };
#        if ($AnnotText =~ m/^(function|transl_table|translation)/) { next }
#        $AnnotText =~ s/\s*\n\s+/ /g;
#
#        # attribute values (text fields) may be too long - skip them
#        if (
#          defined($AnnotText) and length($AnnotText)<=$MaxTextLn and
#          length(exists($GbAnnot[-1]{text})?$GbAnnot[-1]{text}:'')+length($AnnotText) < $MaxText
#        ) {
#          $GbAnnot[-1]{text} .= $AnnotText ."\n";
#        }
#      }
#    }
  }  # end: feature loop

  return \@GbAnnot;
}


# convert sequence data structure to NCBI GenBank format
#
# INTERFACE
# - argument 1: reference to sequence data structure
#
# - options:
#   -debug      [STD]
#
# - return val: - plain sequence format document
#               - undef if an error occurs
#
# DEVELOPER'S NOTES
# - This is a very primer of an output filter
#
sub SeqentryToGenbank {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my (@GbLn, $GbPlain);

  # function parameters
  if (length($$pSeq{SeqPure}||=&SeqStrPure($$pSeq{sequence})) != length($$pSeq{sequence})) {
    $pSeq = &SeqPure (&DataClone($pSeq));
  }
  $debug and printf STDERR "%s. entered sub\n", &MySub;

  # start with header
  #                      LOCUS       NC_000957               5228 bp    DNA     linear   BCT 12-JAN-2004
  push @GbLn,   sprintf("LOCUS       %-21s%s%s%s%s%s%s%s%s%-1s%6d bp    DNA     linear   %3s 25-FEB-2004\n",
    $$pSeq{id}, ('') x 9, length($$pSeq{sequence}), $$pSeq{section}||'PLN' );
  # *** in progress ***
  # missing line breaks of long text fields, cmp. script RestricReport.sh
  push @GbLn,   sprintf("DEFINITION  %s\n", $$pSeq{descr});
  push @GbLn,   sprintf("ACCESSION   %s\n", $$pSeq{id});
  push @GbLn,   sprintf("SOURCE      %s\n", $$pSeq{organism}||'???');
  push @GbLn,   sprintf("  ORGANISM  %s\n", $$pSeq{organism}||'???');
  if (exists($$pSeq{taxa}) and $$pSeq{taxa}) {
  push @GbLn,   sprintf("            %s\n", $$pSeq{taxa});
  }
  push @GbLn,   sprintf("FEATURES             Location/Qualifiers\n");

  # add annotations
  my @SrcAnnot = grep { $_->{type} eq 'source' } @{$$pSeq{annot}};
  if (! int(@SrcAnnot)) {
    my $pAnnot = { range=>sprintf('1..%d',length($$pSeq{sequence})) };
    unshift @{$$pSeq{annot}}, $pAnnot;
  }
  foreach my $pAnnot (@{$$pSeq{annot}}) {
    my %annot = %$pAnnot;
    $annot{range} ||= sprintf('%s%d..%d%s', ($annot{orient}<0)?'complement(':'',
      $annot{offset}, $annot{end}, ($annot{orient}<0)?')':'');
    $annot{note} ||= ($annot{text}=~m/^(comment|note)=("?)(.+)\2/m)[2];
    if ($annot{note}) { push @{$annot{attrib}}, 'note' }

    # type-specific attributes
    if ($annot{type} eq 'source') {
      $annot{organism} ||= $$pSeq{organism};
      if ($annot{organism}) { push @{$annot{attrib}}, 'organism' }
      $annot{organelle} ||= $$pSeq{organelle};
      if ($annot{organelle}) { push @{$annot{attrib}}, 'organelle' }
      $annot{db_xref} ||= $$pSeq{OrganismNum} ? "taxon:$$pSeq{OrganismNum}":'';
      if ($annot{country}) { push @{$annot{attrib}}, 'country' }
      if ($annot{db_xref}) { push @{$annot{attrib}}, 'db_xref' }
      $annot{mol_type} ||= $$pSeq{SeqMolType} || $$pSeq{SeqCateg} || $$pSeq{SeqType};
      if ($annot{mol_type}) { push @{$annot{attrib}}, 'mol_type' }
    }
    if ($annot{type} =~ m/^(CDS|gene|mRNA)/) {
      $annot{gene} ||= ($annot{text}=~m/^gene=("?)(.+)\1/m)[1];
      if ($annot{gene}) { push @{$annot{attrib}}, 'gene' }
      $annot{product} ||= ($annot{text}=~m/^product=("?)(.+)\1/m)[1];
      if ($annot{product}) { push @{$annot{attrib}}, 'product' }
    }
    if ($annot{type} eq 'CDS') {
      $annot{transl_table} ||= 1;
      push @{$annot{attrib}}, 'transl_table';
      if ($annot{transl_frame}) { push @{$annot{attrib}}, 'transl_frame' }
      $annot{translation} ||= &TranslNt(
        &SeqCplxRange({id=>'fake',sequence=>$$pSeq{sequence}}, $annot{range})->{sequence},
        -frame=>$annot{transl_frame});
      push @{$annot{attrib}}, 'translation';
    }
    #                         FEATURE         TEXT------------------------------------------------------|
    push @GbLn, sprintf("     %-15s%s%s%s%s%s %s\n", $annot{type}, ('') x 5, $annot{range});
    foreach (@{$annot{attrib}}) {
    push @GbLn, sprintf("                     /%s=\"%s\"\n", $_, $annot{$_});
    }
  }

  # add sequence
  push @GbLn,           "ORIGIN\n";
  my $CtPos = 1;
  my $sSeq = &_SeqStrCluster ($$pSeq{sequence});
  my @ln = split (/$reEndl/o, $sSeq);
  foreach $sSeq (@ln) {
    push @GbLn, sprintf("%9d %s\n",$CtPos,$sSeq);
    $CtPos += 60;
  }
  push @GbLn,           "//\n";

  $GbPlain = join ('', @GbLn);
  return $GbPlain;
}


################################################################################
# sequence string formatting
################################################################################


# format sequence string in block format
#
# INTERFACE
# - argument 1: source sequence string
#
# - options:
#   -blocks     number of blocks per line
#   -chars      number of characters per block
#   -indent     indent each line by given number of spaces
#   -pure       purify sequence string (done by &SeqStrPure)
#               You may specify a sequence type. Then, fuzzy letters are
#               converted to official 'unknowns'.
#   -upper      handed over to &SeqStrPure
#
# - return val: plain formatted sequence
#
# DESCRIPTION
# - sequence will have an \n at the end
# - as default produce clusters of 10 characters (option '-chars'),
#   6 blocks per line (option '-blocks')
#
sub _SeqStrCluster {
  my $CharDefault  = 10;
  my $BlockDefault = 6;
  my ($sSeq,%opt) = @_;
  my ($NumChar, $NumBlock, $indent);

  # function parameters
  $NumChar  = int($opt{-chars})  || $CharDefault;
  $NumBlock = int($opt{-blocks}) || $BlockDefault;

  # purify sequence string
  if ($opt{-pure}) {
    $sSeq = &SeqStrPure ($sSeq, -SeqType=>$opt{-pure}, -upper=>$opt{-upper});
  } else {
    $sSeq =~ tr/ \t\n\r//d;
  }

  # format sequence
  $sSeq =~ s/([^\s]{$NumChar})/$1 /g;
  $sSeq =~ s/(( ?[^\s]+){$NumBlock}) ?/$1\n/g;
  if (exists($opt{-indent}) and $indent = int($opt{-indent})) {
    $sSeq =~ s/(^|\n)(.+)/$1 . (' ' x $indent) . $2/ge;
  }
  $sSeq =~ s/(\r?\n)*$/\n/;

  return $sSeq;
}


################################################################################
# sequence annotation formats
################################################################################


# sort sequence annotations
#
# INTERFACE
# - argument 1+: array of sequence annotations
# - return val:  array of sequence annotations (sorted)
#
# DESCRIPTION
# - for sorting criteria/order see code.
#
sub AnnotSort {
  return sort {
          $$a{offset} <=> $$b{offset} or
             $$a{end} <=> $$b{end} or
   lc($$a{group}||'') cmp lc($$b{group}||'') or
    lc($$a{type}||'') cmp lc($$b{type}||'') or
    -1 } @_;
}


# resolve complex range syntax (GenBank format)
#
# INTERFACE
# - argument 1:  reference to annotation entry (root of list of annotation
#                entries) in sequence data structure referenced data is
#                explicitly changed
#
# - options:
#   -debug       [STD]
#
# - return val:  success status (boolean)
#
sub AnnotRangeRes {
  my ($pAnnotBase,%opt) = @_;
  my $debug = $opt{-debug};

  # loop over annotations
  my $CtGroup=0;
  for (my $CtAnnot=0; $CtAnnot<int(@$pAnnotBase); ++$CtAnnot) {
    my $pAnnot = $$pAnnotBase[$CtAnnot];
    $$pAnnot{range} or next;

    # found complex syntax - translate
    $debug and printf STDERR "%s. annotation: label %s, text %d chars\n", &MySub,
      $$pAnnot{type}, length $$pAnnot{text};
    my $bRevcompl = ($$pAnnot{range} =~ m/complement/);

    # need to split annotation => found a group
    if ($$pAnnot{range}=~m/,/ or $$pAnnot{type} eq 'CDS') {
      ++ $CtGroup;
      while ($$pAnnot{range} =~ m/[<>]?(\d+)\.{2}[<>]?(\d+)/g) {
        push @$pAnnotBase, {
          type   => $$pAnnot{type},
          offset => $1,
          end    => $2,
          text   => $$pAnnot{text},
          group  => $$pAnnot{group} || $CtGroup,
          orient => $bRevcompl ? -1 : 1,
          };
      }

      # remove old entry
      splice @$pAnnotBase, $CtAnnot, 1;
      -- $CtAnnot;
    }

    # single-range annotation
    else {
      if ($$pAnnot{range} =~ m/[<>]?(\d+)\.{2}[<>]?(\d+)/) {
        @{$pAnnot}{'offset','end'} = ($1,$2);
        $$pAnnot{orient} = $bRevcompl? -1 : 1;
      }
      else {
        printf STDERR "%s. syntax ERROR for annotation range: annot #%d, range %s\n", &MySub,
          $CtAnnot, $$pAnnot{range};
        splice @$pAnnotBase, $CtAnnot, 1;
        -- $CtAnnot;
      }
    }
  }

  # re-sort annotation list
  @$pAnnotBase = &AnnotSort(@$pAnnotBase);

  # return successfully
  return $pAnnotBase;
}


# which attribute in GenBank yields grouping information?
$_LibParam{GbFeatGrp} = {
  "3'UTR" => 'gene',
  "5'UTR" => 'gene',
  CDS     => 'gene',
  exon    => 'gene',
  intron  => 'gene',
  mRNA    => 'gene',
  sig_peptide => 'gene',
  transit_peptide => 'gene',
  };

# translate annotation grouping from several formats to common data structure
#
# INTERFACE
# - argument 1:  sequence data structure
#                data structure is explicitly changed
#
# - options:
#   -debug       [STD]
#
# - return val:  success status (boolean)
#
# DESCRIPTION
# - Input requirements:
#   - The input sequence may be gapped and this won't affect the CDS analysis
#     nor will the sequence be changed in this respect.
# - Output - the target annotation data structure format, see documentations:
#   "sequence annotation data structure"
#   "sequence annotation group data structure"
# - coversion strategies:
#   - the main strategy is to resolve complex-range annotations to a group of
#     annotations having condensed ranges.
#   - special conversion routines (code blocks in this function) pay special
#     attention on grouping annotations the right way, eventually renaming
#     some annotation labels to standard values.
#   - for GenBank source format grouping information is retrieved from the
#     annotation text which contains annotation attributes which give hints
#     on the relations between different annotation entries.
#
# DEBUG, CHANGES, ADDITIONS
# - GenBank annotation remodelling:
#   - turn CDS annotations into gene groups, containing "CDS" feature,
#     referring to "CDS_exon" elements
# - GFF CDS annotation remodelling:
#   - if annotation type "CDS_exon" exists do not turn annotation type "Exon "
#     into CDS group.
#
sub AnnotExpandTscr {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  my (@AnnotSlc, $pAnnot, $pAnnotSub);
  my ($CtGrp, $pGrp, $pGrpIdx, $CtGrpSub);
  my ($pCds, $pSeqCds, $pCdsOrf, $CdsPos, $CdsPosG);
  my ($CtI);

  ##############################################################################
  # transcript / CDS from Experiment, Dictyostelium project-specific
  # - the source object of this function is an Experiment file format with a
  #   DictyProject-specific syntax for the highlighting of the gene structure
  #   The annotation philosophy is:
  #   - an annotation (label "EXON") overlaps the complete range of
  #     CDS elements of a gene. Attributes in the text field may be: gene,
  #     gene_name, ...
  #   - additional annotations (label "ENZ8") mark splice sites
  #   - from these annotations the whole CDS_exon structure can be computed.
  # - over all, when scanning for gene annotations, we check for
  #   - tRNA               => skip transcript
  #   - 'non-coding'       => skip CDS range sampling
  #   - inconsistent ORF   => skip CDS range sampling
  #   - inconsistent start => WARNING
  # - the aim is to compute an array of following annotation types:
  #   - SpliceDonor, SpliceAcceptor: These annotations need to be provided
  #     in full consistency
  #   - CDS_exon: true-coding parts computed from the splice site annotations
  #   - start, stop: these annotations can be computed following the
  #     largest-ORF rule
  #   - annotation groups having entries "CDS", themselve referring to
  #     annotations of type "CDS_exon"
  if ($$pSeq{SrcFmt} eq 'Experiment' and
     (@AnnotSlc = grep { $_->{layer} eq 'cons' and $_->{type} eq 'EXON' } @{$$pSeq{annot}})) {

    AnnotExpandTscrTssExperDictyExon:
    foreach $pAnnot (@AnnotSlc) {
      # found an EXON tag => trapped a gene

      # create new group for encountered transcribed sequence
      $pGrp = { %$pAnnot };
      $$pGrp{type} = 'TCS';
      $$pGrp{id} = ($$pGrp{text}=~m/^(gene|id)=(\w+)/m) ? $2 : ++$CtGrp;
      if (exists $$pSeq{AnnotGrp}{$$pGrp{id}}) {
        $$pGrp{id} .= '_'. (++ $CtGrp);
      }

      # transcript is tRNA? => skip it
      if ($$pAnnot{text} =~ m/\b[A-Z][a-z]{2}-[ACGTU]{3}\b/s) {
        $debug and printf STDERR "%s(ExperDicty). transcript group %s is tRNA\n", &MySub, $$pGrp{id};
        next AnnotExpandTscrTssExperDictyExon;
      }

      # enter transcript group into sequence data structure
      if (exists ($$pSeq{AnnotGrp}{$$pGrp{id}}) and %{$$pSeq{AnnotGrp}{$$pGrp{id}}}) {
        printf STDERR "%s(ExperDicty). WARNING: new transcript group %s clashes with existing group\n", &MySub, $$pGrp{id};
        printf STDERR "  seq %s, existing group at %d..%d %s, new group at %d..%d %s\n", $$pSeq{id}||"''",
          $$pSeq{AnnotGrp}{$$pGrp{id}}{offset}, $$pSeq{AnnotGrp}{$$pGrp{id}}{end}, &SignChar($$pSeq{AnnotGrp}{$$pGrp{id}}{orient},-allow0=>1),
          $$pGrp{offset}, $$pGrp{end}, &SignChar($$pGrp{orient},-allow0=>1);
      } elsif ($debug) {
        $debug and printf STDERR "%s(ExperDicty). creating transcript group %s, seq %s, at %d..%d %s\n", &MySub,
          $$pGrp{id}, $$pSeq{id}||"''",
          $$pGrp{offset}, $$pGrp{end}, &SignChar($$pGrp{orient},-allow0=>1);
      }
      $$pSeq{AnnotGrp}{$$pGrp{id}} = $pGrp;

      # enter CDS data sub-structure into group data structure
      # we may delete this later if the transcript turns out to be non-coding
      $pCds = $$pGrp{CDS} = {};
      @{$pCds}{'offset','end'} = @{$pGrp}{'offset','end'};

      # sample signal entries inside transcript range
      foreach $pAnnotSub (grep { $_->{type} eq 'ENZ8' and
        $_->{offset} >= $$pAnnot{offset} and $_->{end} <= $$pAnnot{end}
        } grep { $_->{layer} eq 'cons' } @{$$pSeq{annot}}
      ) {

        # skip signal that shows grouping elsewhere
        if ($$pAnnotSub{text} =~ m/^group=(.+)$/m) {
          $$pAnnotSub{group} = $1;
          if ($$pAnnotSub{group} ne $$pGrp{id}) {
            printf STDERR "%s(ExperDicty). skipped grouped signal that's grouped elsewhere - seq %s\n", &MySub, $$pSeq{id};
            printf STDERR "  transcript %s at %d..%d %s, signal %s (group %s) at %d..%d %s\n",
              $$pGrp{id}, $$pGrp{offset}, $$pGrp{end}, &SignChar($$pGrp{orient},-allow0=>1),
              $$pAnnotSub{type}, $$pAnnotSub{group},
              $$pAnnotSub{offset}, $$pAnnotSub{end}, &SignChar($$pAnnotSub{orient},-allow0=>1);
            next;
          }
        }

        # refine signal syntax
        $$pAnnotSub{text} =~ m/^\w+/;
        $$pAnnotSub{type} = $& || '';
        $$pAnnotSub{type} =~ s/^SD$/SpliceDonor/;
        $$pAnnotSub{type} =~ s/^SA$/SpliceAccept/;
        $$pAnnotSub{type} =~ s/^start$/TranslStart/;
        $$pAnnotSub{type} =~ s/^stop$/TranslStop/;
        if (! $$pAnnotSub{type} or $$pAnnotSub{type} =~ m/^(intron|xxx)$/) {
          printf STDERR "%s(ExperDicty). WARNING: bad gene tagging syntax! - seq %s, feature %s\n", &MySub, $$pSeq{id}, $$pAnnotSub{type};
          printf STDERR "  Usage of %s is highly recommended\n", $CorePath{call}{DATagReduce};
          next AnnotExpandTscrTssExperDictyExon;
        }

        # warn for mis-oriented signals, adapt orientation
        if ($$pAnnotSub{orient} != $$pGrp{orient}) {
          printf STDERR "%s(ExperDicty). WARNING: conflicting transcript/signal orientation - seq %s\n", &MySub, $$pSeq{id};
          printf STDERR "  transcript %s at %d..%d %s, signal %s at %d..%d %s\n",
            $$pGrp{id}, $$pGrp{offset}, $$pGrp{end}, &SignChar($$pGrp{orient},-allow0=>1),
            $$pAnnotSub{type},
            $$pAnnotSub{offset}, $$pAnnotSub{end}, &SignChar($$pAnnotSub{orient},-allow0=>1);
          $$pAnnotSub{orient} = $$pGrp{orient};
          print  STDERR "  orientation changed\n",
        }

        # enter signal as a member into group data structure
        $debug and printf STDERR "%s(ExperDicty). found feature %s, at %d..%d %s\n", &MySub, $$pAnnotSub{type},
          $$pAnnotSub{offset}, $$pAnnotSub{end}, &SignChar($$pAnnotSub{orient},-allow0=>1);
        push @{$$pGrp{member}}, $pAnnotSub;

        # process CDS-relevant signals
        if ($$pAnnotSub{type} =~ m/^(TranslStart|SpliceDonor|SpliceAccept|TranslStop)$/) {

          # enter CDS start/stop
          if ($$pAnnotSub{type} eq 'TranslStart' and not $$pCds{start}) {
            $$pCds{start} = ($$pGrp{orient} > 0) ? $$pAnnotSub{offset} : $$pAnnotSub{end};
            $$pCds{($$pGrp{orient} > 0) ? 'offset':'end'} = $$pCds{start};
          }
          if ($$pAnnotSub{type} eq 'TranslStop' and not $$pCds{stop}) {
            $$pCds{stop} = ($$pGrp{orient} > 0) ? $$pAnnotSub{end} : $$pAnnotSub{offset};
            $$pCds{($$pGrp{orient} > 0) ? 'end':'offset'} = $$pCds{stop};
          }

          # enter splice signals (checked later for CDS-relevance)
          if ($$pAnnotSub{type} eq 'SpliceDonor') {
            push @{$$pCds{splice}}, ($$pGrp{orient} > 0) ? $$pAnnotSub{offset}-1 : $$pAnnotSub{end}+1;
          }
          if ($$pAnnotSub{type} eq 'SpliceAccept') {
            push @{$$pCds{splice}}, ($$pGrp{orient} > 0) ? $$pAnnotSub{end}+1 : $$pAnnotSub{offset}-1;
          }
        }
      }

      # transcript is non-protein? => skip CDS range sampling
      if ($$pAnnot{text} =~ m/(^|\n)gene_type=non-protein/s) {
        $debug and printf STDERR "%s(ExperDicty). transcript group %s is non-protein coding\n", &MySub, $$pGrp{id};
        delete $$pGrp{CDS};
        next AnnotExpandTscrTssExperDictyExon;
      }

      # redo block (mono-loop) for final assignment of CDS boundaries: start, stop
      # derive CDS_exon ranges, complex range
      AnnotExpandTscrTssExperDictyStart: {

        # sample splice signals inside CDS range, check consistency
        while (@{$$pCds{splice}} and  $$pCds{splice}[0] < $$pCds{offset}) {
          shift @{$$pCds{splice}};
        }
        while (@{$$pCds{splice}} and $$pCds{splice}[-1] > $$pCds{end}) {
          pop @{$$pCds{splice}};
        }
        if (int (@{$$pCds{splice}}) % 2) {
          printf STDERR "%s(ExperDicty). ERROR: bad gene tagging! Odd number (%d) of splice signals, seq %s, at %d..%d %s\n", &MySub,
            int (@{$$pCds{splice}}), $$pSeq{id}||"''",
            $$pAnnot{offset}, $$pAnnot{end}, &SignChar($$pAnnot{orient},-allow0=>1);
          delete $$pGrp{CDS};
          next AnnotExpandTscrTssExperDictyExon;
        }

        # derive complex range field
        $$pCds{pos} = [ $$pCds{offset}, @{$$pCds{splice}}, $$pCds{end} ];
        delete $$pCds{range};
        while (@_ = splice (@{$$pCds{pos}}, 0, 2)) {
          $$pCds{range} .= ($$pCds{range} ? ',':'') . $_[0] .'..'. $_[1];
        }
        if ($$pCds{range} =~ m/,/) { $$pCds{range} = "join($$pCds{range})"; }
        if ($$pGrp{orient} < 0) {    $$pCds{range} = "complement($$pCds{range})"; }
        delete $$pCds{pos};

        # missing start/stop
        unless ($$pCds{start} and $$pCds{stop}) {
          $debug and printf STDERR "%s(ExperDicty). missing %s annotation\n", &MySub, join (' & ', grep { ! $$pCds{$_} } ('start', 'stop'));
          $debug and printf STDERR "  CDS range: %s\n", $$pCds{range};

          # cut out CDS sequence, translate
          $pSeqCds = &SeqCplxRange ($pSeq, $$pCds{range}, -TrackPos=>1, -debug=>$dbg2);
          unless ($pSeqCds and $$pSeqCds{sequence}) {
            printf STDERR "%s(ExperDicty). ERROR: unable to get CDS sequence, seq %s, range %s - skipped\n", &MySub,
              $$pSeq{id}||"''", $$pCds{range};
            next AnnotExpandTscrTssExperDictyExon;
          }
          $debug and printf STDERR "%s(ExperDicty). CDS sequence: %d bp\n%s\n", &MySub,
            length $$pSeqCds{sequence}, $$pSeqCds{sequence};
          $pCdsOrf = &TranslOrfMax ($$pSeqCds{sequence}, -SlcFrame=>[1,2,3], -SlcStart=>$$pCds{start}?'\w':undef, -debug=>$dbg2);
          unless ($pCdsOrf and $$pCdsOrf{offset}) {
            printf STDERR "%s(ExperDicty). WARNING: missing max.size ORF for seq %s, range %s - skipped\n", &MySub,
              $$pSeq{id}||"''", $$pCds{range};
            printf STDERR "  CDS sequence: %d bp\n%s\n", length $$pSeqCds{sequence}, $$pSeqCds{sequence};
            delete $$pGrp{CDS};
            next AnnotExpandTscrTssExperDictyExon;
          }
          $debug and printf STDERR "%s(ExperDicty). longest ORF in gene - data:\n%s", &MySub;
          $debug and &DataPrint($pCdsOrf,-handle=>\*STDERR,-space=>2,-NoMyRef=>1);
          if ($$pCdsOrf{LengthNt} < 0.5 * length($$pSeqCds{sequence})) {
            printf STDERR "%s(ExperDicty). WARNING: possible ORF range is only %.2f of gene length, seq %s, range %s - skipped\n", &MySub,
              $$pCdsOrf{LengthNt} / length($$pSeqCds{sequence}), $$pSeq{id}||"''", $$pCds{range};
            delete $$pGrp{CDS};
            next AnnotExpandTscrTssExperDictyExon;
          }

          # assign start/stop and redo
          unless ($$pCds{start}) {
            $CdsPos = $$pCdsOrf{offset} - 1;                  # string position of first nt of start codon in CDS sequence
            $$pSeqCds{sequence} =~ m/^-*(\w-*){$CdsPos}/;     # translate to gapped CDS sequence string position
            $$pCds{start} = $$pSeqCds{PosArr}[length($&)];  # translate to contig position
            $debug and printf STDERR "%s(ExperDicty). assigning start: CDS pos. %d (gapped %d) -> pos. seq %d\n", &MySub,
              $$pCdsOrf{offset}, length($&) + 1, $$pSeqCds{PosArr}[length($&)];
            $$pCds{($$pGrp{orient} > 0) ? 'offset':'end'} = $$pCds{start};
          }
          unless ($$pCds{stop}) {
            $CdsPos = $$pCdsOrf{end} + 3 - 1;                 # string position of last nt of stop codon in CDS sequence
                                                                # translate to gapped CDS sequence string position
            if ($$pSeqCds{sequence} !~ m/^-*(\w-*){$CdsPos}/ and $CdsPosG = length ($&)) {
              $CdsPos -= 3;
              $debug and printf STDERR "%s(ExperDicty). assuming truncated CDS, trying pos %d\n", &MySub, $CdsPos;
              $$pSeqCds{sequence} =~ m/^-*(\w-*){$CdsPos}/ and $CdsPosG = length ($&);
            } else {
              $debug and printf STDERR "%s(ExperDicty). pattern match to grab CDS failed, trying pos %d\n", &MySub, $CdsPos;
              $CdsPosG = $CdsPos;
            }
            $$pCds{stop} = $$pSeqCds{PosArr}[$CdsPosG];     # translate to contig position
            $debug and printf STDERR "%s(ExperDicty). assigning stop: CDS pos. %d (gapped %d) -> seq pos. %d\n", &MySub,
              $$pCdsOrf{end}, $CdsPosG + 1, $$pSeqCds{PosArr}[$CdsPosG];
            $$pCds{($$pGrp{orient} > 0) ? 'end':'offset'} = $$pCds{stop};
          }
          redo AnnotExpandTscrTssExperDictyStart;
        }

        # have start/stop
        # testing will be done in &CdsStruct
        $debug and printf STDERR "%s(ExperDicty). got CDS: seq %s, range %s\n", &MySub,
          $$pSeq{id}||"''", $$pCds{range};

        # refine CDS data structure, create set of CDS_exon annotations
        $$pCds{type} = 'CDS';
        delete $$pCds{splice};
        $$pCds{member} = [ { %$pCds } ];
        $$pCds{group} = $$pCds{member}[0]{group} = $$pGrp{id};
        $$pCds{member}[0]{type} = 'CDS_exon';
        &AnnotRangeRes ($$pCds{member}, -debug=>$dbg2);
        push @{$$pSeq{annot}}, @{$$pCds{member}};
        push @{$$pGrp{member}}, @{$$pCds{member}};
        delete $$pCds{member};
        # sort members (annotations) in group
        @{$$pGrp{member}} = &AnnotSort (@{$$pGrp{member}});
      }
    }  # end AnnotExpandTscrTssExperDictyStart
  }  # end transcript/gene conversion from Experiment Dicty

  ##############################################################################
  # transcript / CDS from GFF
  # - Input format support (GFF, any version) is optimal for GeneID gene
  #   prediction output.
  # - The input sequence may be gapped and this won't affect the CDS analysis
  #   nor will the sequence be changed in this respect.
  # - the target annotation syntax is that of GenBank file format
  # - currently, geneid-specific exon labelling syntax is kept
  if ($$pSeq{SrcFmt} eq 'GFF') {
    # redo block (mono-loop)
    AnnotExpandTscrTssGff: {
      $debug and printf STDERR "%s(GFF). seq %s\n", &MySub, $$pSeq{id}||"''";
      my ($pGrpRaw, %exon);
      # $pGrpIdx will be:
      # - hash reference:
      #   - each value represents a transcribed sequence => reference to array
      #     of contributing elementary annotations
      #   - entries are indexed by group label
      # - indexing array references: array of annotations belonging to that
      #   transcribed sequence (forming group)

      # group gene-related annotations ('features' according to GFF syntax)
      # - annotations are expected here to be in standard order
      foreach $pAnnot (@{$$pSeq{annot}}) {
        $$pAnnot{type} =~ m/^(cds(_exon)?|exon(_cds)?|first|internal|single|terminal)$/i or next;
        $debug and printf STDERR "%s. got exon annotation,"
          ." seq %s, group %s, range %d..%d\n", &MySub, $$pSeq{id}||"''",
          $$pAnnot{group}||"''", @{$pAnnot}{'offset','end'};
        push @{$$pGrpIdx{$$pAnnot{group}}}, $pAnnot;
      }
      # anything to do?
      if (int(keys %$pGrpIdx) == 0 or
         (int(keys %$pGrpIdx) == 1 and (keys %$pGrpIdx)[0] eq '.')
        ) {
        last AnnotExpandTscrTssGff;
      }

      # loop over groups (CDSs)
      # sorted according to offset of first member
      foreach $pGrpRaw (sort{ $a->[0]{offset}<=>$b->[0]{offset} } values %$pGrpIdx) {

        # preliminary start/stop in geneid prediction?
        # -> extend the gene range to the next frame borders
        # -  we need the exons sorted here (as done in the foreach expression)
        $exon{first} = ($$pGrpRaw[0]{orient} > 0) ? $$pGrpRaw[0] : $$pGrpRaw[-1];
        if ($exon{first}{type} =~ m/^(internal|terminal)$/i) {  # this is geneid syntax
          if ($$pGrpRaw[0]{orient} < 0) {
            $debug and printf STDERR "%s. shifting gene offset to frame phase 0: move %d\n", &MySub,
              -$exon{first}{frame};
            $exon{first}{end}    -= $exon{first}{frame};
            $exon{first}{endblur} = 1;
          } else {
            $debug and printf STDERR "%s. shifting gene offset to frame phase 0: move %d\n", &MySub,
              $exon{first}{frame};
            $exon{first}{offset} += $exon{first}{frame};
            $exon{first}{offblur} = 1;
          }
        }
        $exon{last} = ($$pGrpRaw[0]{orient} > 0) ? $$pGrpRaw[-1] : $$pGrpRaw[0];
        if ($exon{last}{type} =~ m/^(internal|first)$/i) {  # this is geneid syntax
          # what comes here?
          # we have to compute the frame phase at the end of the annotated CDS
        }

        # join features to CDS
        # annotations are expected here to be in standard order
        $$pSeq{AnnotGrp}{$$pGrpRaw[0]{group}} = {
          type   => 'TCS',
          offset => ($$pGrpRaw[0]{orient} > 0) ? $exon{first}{offset} : $exon{last}{end},
          end    => ($$pGrpRaw[0]{orient} > 0) ? $exon{last}{end} : $exon{first}{offset},
          orient => $$pGrpRaw[0]{orient},
          group  => $$pGrpRaw[0]{group},
          CDS => {
            type   => 'CDS',
            range  => sprintf ('join(%s)', join (',', map { "$_->{offset}..$_->{end}" } @$pGrpRaw)),
            member => [ grep { $_->{group} eq $$pGrpRaw[0]{group} } @{$$pSeq{annot}} ],
            text   => "gene=$$pGrpRaw[0]{group}",
            group  => $$pGrpRaw[0]{group},
            },
          };
        if ($$pGrpRaw[0]{orient} < 0) {
          $$pSeq{AnnotGrp}{$$pGrpRaw[0]{group}}{CDS}{range} = sprintf
            "complement(%s)", $$pSeq{AnnotGrp}{$$pGrpRaw[0]{group}}{CDS}{range};
        }

        # rename annotation labels
        foreach $pAnnot (@$pGrpRaw) {
          $$pAnnot{type} = 'CDS_exon';
        }
      }

      # generate intron annotations if missing
      if (! grep { $_->{type} eq 'intron' } @{$$pSeq{annot}} and
          grep { $_->{type} eq 'TCS' } values %{$$pSeq{AnnotGrp}}) {
        foreach $pCds (map { $_->{CDS} } grep { $_->{type} eq 'TCS' } values %{$$pSeq{AnnotGrp}}) {
          for ($CtI=1; $CtI<@{$$pCds{member}}; $CtI++) {
            $pAnnot = { %{$$pCds{member}[$CtI]} };
            $$pAnnot{type} = 'intron';
            $$pAnnot{group} = $$pCds{group};
            $$pAnnot{offset} = $$pCds{member}[$CtI-1]{end} + 1;
            $$pAnnot{end} = $$pCds{member}[$CtI]{offset} - 1;
            push @{$$pSeq{annot}}, $pAnnot;
          }
        }

        # re-sorting of annotation list is done globally via &AnnotRangeRes
      }
    }
  }  # end transcript/gene conversion from GFF

  ##############################################################################
  # group label for gene features coming from GenBank format
  if ($$pSeq{SrcFmt} eq 'GenBank') {

    foreach $pAnnot (@{$$pSeq{annot}}) {

      # try to conserve existing group labels
      # we have an index table for how to get a group label from annotation
      #   attributes depending on the annotation type
      if (exists $_LibParam{GbFeatGrp}{$$pAnnot{type}} and
          $$pAnnot{text} =~ m/^$_LibParam{GbFeatGrp}{$$pAnnot{type}}=(.+)/
      ) {
        $$pAnnot{group} = $1;
        $$pAnnot{group} =~ s/^"//; $$pAnnot{group} =~ s/"$//;
        $$pAnnot{group} = (split (/\s+/, $$pAnnot{group}))[0];
      }
    }

    # loop over annotations which will be split into a group of annotations
    # - annotation CDS
    foreach $pAnnot (grep {
      exists($_->{group}) and $_->{group} and
      exists($_->{range}) and ($_->{range}=~m/,/ or $_->{type} eq 'CDS')
    } @{$$pSeq{annot}}) {
      if (exists($$pAnnot{GroupSub}) and $$pAnnot{GroupSub}) { next }

      # introduce sublabel if there're multiple occurrences of one annotation
      # type with the same label, having complex range
      # - sub-labelling is not ... [hm? forgot, what i wanted to say here]
      # - we assume that field "GroupSub" was not used before
      @AnnotSlc = grep {
        $_ ne $pAnnot and
        exists($_->{group}) and $_->{group} eq $$pAnnot{group} and
        $_->{type} eq $$pAnnot{type}
      } @{$$pSeq{annot}};
      if (@AnnotSlc) {
        $CtGrpSub = 0;
        foreach $pAnnotSub (&AnnotSort (@AnnotSlc, $pAnnot)) {
          $$pAnnotSub{GroupSub} = ++ $CtGrpSub;
        }
      }
    }

    # finish labels of complex groups
    foreach (grep { exists $_->{GroupSub} } @{$$pSeq{annot}}) {
      $_->{GroupSub} and $_->{group} .= '.'. $_->{GroupSub};
      delete $_->{GroupSub};
    }

    # resolve annotations which still have complex range information
    # annotations are re-sorted there
    &AnnotRangeRes ($$pSeq{annot}, -debug=>$dbg2);

    # create group for CDS annotation
    # we set the labels following the same procedure like in &AnnotRangeRes.
    #   The according codes need to have the same results!
    foreach $pAnnot (grep { $_->{type} eq 'CDS' } @{$$pSeq{annot}}) {
      unless ($$pAnnot{group}) {
        $pGrpIdx = { map { ($_=>1) } map { @$_ }
          &DataTreeSlc ($$pSeq{annot}, [[0,'all'],['group']], -unique=>1) || []
          };
        $$pAnnot{group} = 1;
        while ($$pGrpIdx{$$pAnnot{group}}) { $$pAnnot{group} ++; }
      }
      $$pSeq{AnnotGrp}{$$pAnnot{group}} ||= {
        type   => 'TCS',
        group  => $$pAnnot{group},
        CDS => {
          type   => 'CDS',
          group  => $$pAnnot{group},
          text   => sprintf ("gene=%s", $$pAnnot{group}||$CtGrp),
          member => [],
          },
        };
      push @{$$pSeq{AnnotGrp}{$$pAnnot{group}}{CDS}{member}}, $pAnnot;
    }

    # rename label for exon annotations
    # complete transcript groups
    foreach $pGrp (grep { $_->{type} eq 'TCS' } values %{$$pSeq{AnnotGrp}}) {
      @AnnotSlc = grep {
        exists($_->{group}) and $_->{group} eq $$pGrp{group} and $_->{type} eq 'CDS'
        } @{$$pSeq{annot}};
      foreach $pAnnot (@AnnotSlc) {
        $$pAnnot{type} = 'CDS_exon';
      }
      $$pGrp{offset} = &Min (map { $_->{offset} } @AnnotSlc);
      $$pGrp{end} = &Max (map { $_->{end} } @AnnotSlc);
      $$pGrp{orient} = $AnnotSlc[0]{orient};
      $$pGrp{CDS}{member} = [ &AnnotSort (@AnnotSlc) ];
      $$pGrp{CDS}{range} = sprintf (
        ($$pGrp{orient} > 0) ? 'join(%s)':'complement(join(%s))',
        join (',', map { "$_->{offset}..$_->{end}" } @AnnotSlc));
    }

    # generate intron annotations if missing
    if (! grep { $_->{type} eq 'intron' } @{$$pSeq{annot}} and
        grep { $_->{type} eq 'TCS' } values %{$$pSeq{AnnotGrp}}) {
      foreach $pCds (map { $_->{CDS} } grep { $_->{type} eq 'TCS' } values %{$$pSeq{AnnotGrp}}) {
        for ($CtI=1; $CtI<@{$$pCds{member}}; $CtI++) {
          $pAnnot = { %{$$pCds{member}[$CtI]} };
          $$pAnnot{type} = 'intron';
          $$pAnnot{group} = $$pCds{group};
            # changed 20030612
          $$pAnnot{offset} = $$pCds{member}[$CtI-1]{end} + 1;
          $$pAnnot{end} = $$pCds{member}[$CtI]{offset} - 1;
          push @{$$pSeq{annot}}, $pAnnot;
        }
      }

      # re-sorting of annotation list is done globally via &AnnotRangeRes
    }
  }

  ##############################################################################
  # global post-work

  # resolve annotations which still have complex range information
  # annotations are re-sorted there
  &AnnotRangeRes ($$pSeq{annot}, -debug=>$dbg2);

  # assign group labels to grouped annotations
  # The group feature data structure is imcompletely implemented
  foreach $pGrp (values %{$$pSeq{AnnotGrp}}) {
    foreach $pAnnot (@{$$pGrp{member}}) {
      $$pAnnot{group} ||= $$pGrp{id};
    }
  }

  # return successfully
  return 1;
}


# retrieve splice site annotations from several source annotation formats
#
# INTERFACE
# - argument 1:  sequence data structure
#                data structure is explicitly changed
#
# - options:
#   -debug       [STD]
#
# - return val:  success status (boolean)
#
sub AnnotExpandSplice {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};
  my (@AnnotSlc, $pAnnot, $pAnnotNew, $CtAnnot);
  my (%GrpIdx, $ItGrp, $pGrp);

  # need to do anything?
  if (grep /^Splice(Donor|Accept)/, @{$$pSeq{annot}}) { return 1 }

  # need nice gene annotation
  $$pSeq{AnnotGrp} or &AnnotExpandTscr ($pSeq);

  # sample intron annotations
  unless (@AnnotSlc = grep { $_->{type} eq 'intron' } @{$$pSeq{annot}}) {

    # no intron annotations
    # sample exon annotations from transcript sequence groups (label "TCS")
    if (@AnnotSlc = map { (exists $_->{CDS}) ? $_->{CDS} : () } values %{$$pSeq{AnnotGrp}}) {
      foreach $ItGrp (@AnnotSlc) {
        $GrpIdx{$$ItGrp{group}} = [
          sort { $a->{offset} <=> $b->{offset} }
          grep { $_->{type} =~ m/^((cds_)?exon(_cds)?|first|internal|single|terminal)$/i }
          @{$$ItGrp{member}} ];
      }
      if ($debug) {
        printf STDERR "%s. contructing splice annotations from 'TCS' groups\n", &MySub;
        printf STDERR "  groups: %d (%d)\n", int @AnnotSlc, int keys %GrpIdx;
        printf STDERR "  total annotations: %d\n", int (map { @$_ } values %GrpIdx);
        printf STDERR "  example annotation type: %s\n", $AnnotSlc[0]{member}[0]{type}||"''";
      }
    }

    # sample flat (uncombined) exon annotations
    # exons need to have group labels!
    else {
      $debug and printf STDERR "%s. contructing splice annotations from CDS_exon/exon annotations\n", &MySub;
      unless (@AnnotSlc = grep { $_->{group} and $_->{type} eq 'CDS_exon' } @{$$pSeq{annot}}) {
        @AnnotSlc =
          grep { $_->{group} and $_->{type} =~ m/^((cds_)?exon(_cds)?|first|internal|single|terminal)$/i }
          @{$$pSeq{annot}};
      }
      foreach $pAnnot (@AnnotSlc) {
        $GrpIdx{$$pAnnot{group}} ||= [];
        push @{$GrpIdx{$$pAnnot{group}}}, $pAnnot;
      }
      foreach $pGrp (values %GrpIdx) {
        @$pGrp = sort { $a->{offset} <=> $b->{offset} } @$pGrp;
      }
    }

    # create intron annotations from grouped exon annotations
    @AnnotSlc = ();
    # $pGrp iterates as array reference on grouped exon annotations
    foreach $pGrp (values %GrpIdx) {
      for ($CtAnnot=1; $CtAnnot<@$pGrp; $CtAnnot++) {
        $pAnnot = { %{$$pGrp[$CtAnnot]} };
        $$pAnnot{type} = 'intron';
        $$pAnnot{end} = $$pAnnot{offset} - 1;
        $$pAnnot{offset} = $$pGrp[$CtAnnot-1]{end} + 1;
        push @AnnotSlc, $pAnnot;
      }
    }
    push @{$$pSeq{annot}}, @AnnotSlc;
  }

  # create splice site annotations from intron annotations
  foreach $pAnnot (@AnnotSlc) {
    $pAnnotNew = { %$pAnnot };
    $$pAnnotNew{type} = ($$pAnnot{orient} > 0) ? 'SpliceDonor' : 'SpliceAccept';
    $$pAnnotNew{end} = $$pAnnot{offset} + 1;
    push @{$$pSeq{annot}}, $pAnnotNew;
    $pAnnotNew = { %$pAnnot };
    $$pAnnotNew{type} = ($$pAnnot{orient} > 0) ? 'SpliceAccept' : 'SpliceDonor';
    $$pAnnotNew{offset} = $$pAnnot{end} - 1;
    push @{$$pSeq{annot}}, $pAnnotNew;
  }

  # re-sort annotations
  @{$$pSeq{annot}} = &AnnotSort (@{$$pSeq{annot}});

  # return successfully
  return $pSeq;
}


$_LibParam{AnnotClipDfltLbl} = [ qw(COLI NNNN MASK REPT SPSQ) ];
$_LibParam{AnnotClipDfltSpace} = 2;

# clip off flanking ranges of sequence annotations
#
# INTERFACE
# - argument 1: reference to sequence data structure
#               referenced data is explicitly changed
#
# - options:
#   -AnnotLbl   reference to array of annotation labels to be clipped
#   -debug      [STD]
#   -space      this size of non-annotated range at the margins may be clipped
#               if an annotation follows
#
# DESCRIPTION
# - annotations of the referenced type are removed if present at the left or
#   right margins of the sequence string.
#
sub AnnotClip {
  my ($pSeq,%opt) = @_;
  my $debug = $opt{-debug};

  # initialize working parameters
  my %ClipAnnotIdx = map { ($_, 1) } $opt{-AnnotLbl} ?
    @{$opt{-AnnotLbl}} : @{$_LibParam{AnnotClipDfltLbl}};
  my $ClipSpace = $opt{-space} || $_LibParam{AnnotClipDfltSpace};
  my $iSeqLen = length ($$pSeq{sequence});

  # array of annotations that have of selected label and are adjacent to current sequence range
  my (@AnnotClip);
  foreach (grep { $ClipAnnotIdx{$_->{type}} } @{$$pSeq{annot}}) {
    if ($_->{offset} <= $iSeqLen and $_->{end} >= 1) {
      $debug and printf STDERR "%s. enter annotation to working list: sequence %s, SeqLen %d, type %s, offset %d, end %d\n", &MySub,
        $$pSeq{id}||"''", $iSeqLen, $_->{type}, $_->{offset}, $_->{end};
      push @AnnotClip, $_;
    }
  }

  # starting margin positions (biological counting)
  my $ClipLeft  = 1 + $ClipSpace;
  my $ClipRight = $iSeqLen - $ClipSpace;

  # loop over selected annotations
  for (my $CtI=0; $CtI<@AnnotClip; $CtI++) {
    if ($AnnotClip[$CtI]{offset} <= $ClipLeft) {
      $debug and printf STDERR "%s. clipping off annotated sequence position %d (left)\n", &MySub, $AnnotClip[$CtI]{offset};
      substr ($$pSeq{sequence}, 0, $AnnotClip[$CtI]{end}) =~ s/./-/g;
      $ClipLeft = &Max ($ClipLeft, $AnnotClip[$CtI]{end} + 1 + $ClipSpace);
      splice @AnnotClip, $CtI, 1;
      $CtI = -1; next;
    }
    if ($AnnotClip[$CtI]{end} >= $ClipRight) {
      $debug and printf STDERR "%s. clipping off annotated sequence position %d (right)\n", &MySub, $AnnotClip[$CtI]{offset};
      substr ($$pSeq{sequence}, $AnnotClip[$CtI]{offset}-1, $iSeqLen-$AnnotClip[$CtI]{offset}+1) =~ s/./-/g;
      $ClipRight = &Min ($ClipRight, $AnnotClip[$CtI]{offset} - 1 - $ClipSpace);
      splice @AnnotClip, $CtI, 1;
      $CtI = -1; next;
    }
  }
}


1;
# $Id: SeqFormat.pm,v 1.52 2018/06/05 18:02:56 szafrans Exp $
