################################################################################
#
#  kPerl AlnK Alignment Suite
#  Standard Library for Align.pl
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Genome Analysis, 1998-2002,2004,
#    szafrans@imb-jena.de
#  Karol Szafranski on behalf of FLI Jena, Genome Analysis Group, 2006,
#    szafrans@fli-leibniz.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#   %LibGlob
#   %_LibParam
#
# - basics
#   alignment project data structure
#   @_LibParam{qw(Version ParamDflt Phrase)}
#
# - sequence reference I/O
#   %SeqOrig
#   $_poSeqIndex
#   &_SeqOrigPreload
#   &SeqOriginal
#   &_SeqOrigCheck
#   &SeqExtend
#   &SeqOrigClear
#
# - project I/O, meta-handling
#   &AlnprojGet
#
# - AlnK format input
#   project index data structure
#   &_AlnprojClustalList
#   &_AlnprojClustalTrue
#   &_AlnprojClustalChoose
#   &AlnprojClustalOpen
#
# - AlnK format output
#   $_LibParam{*}
#   &AlnprojIdcomplex
#   &AlnprojClustalSprint
#   &AlnprojSprint
#   &AlnprojToFile
#
# - project parameter I/O
#   &_AlnprojParamRead
#   &_AlnprojParamDebug
#   &AlnprojParamSprint
#   &AlnprojName
#
# - GAP4 directed assembly I/O
#   &AlnprojGapdirOpen
#
# - project operations
#   &AlnprojClone
#   &AlnprojSplit
#   &AlnprojSplit3
#   &AlnprojJoin
#   &AlnprojEntryDel
#
# - project sequence operations
#   &AlnprojExtensHide
#   &AlnprojExtensValid
#   &AlnprojConsUpdate
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#   -timer      print time performance protocol to STDERR
#
################################################################################

package SeqLab::AlnProj;

# includes
use strict; #use warnings;  # OK 20060222
use MainLib::StrRegexp;
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&QueryConfirm);
use MainLib::File qw(&ReadFile);
use MainLib::Misc;
use Math::Calc;
use database::DbPlain qw(&PlainToTable);
use SeqLab::SeqBench;
use SeqLab::SeqCompOld qw(&SeqStretchRepair);
use SeqLab::SeqFormat;
use SeqLab::SeqStreamIn;
use SeqLab::SeqAnalysis;
use SeqLab::Align;
use SeqLab::Blast qw(%BlastDefault);
use ReadWatch::ReadIndex;

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &SeqOriginal &SeqExtend &SeqOrigClear
  &AlnprojGet
  &AlnprojClustalOpen
  &AlnprojIdcomplex &AlnprojClustalSprint &AlnprojSprint &AlnprojToFile
  &AlnprojName &AlnprojParamSprint
  &AlnprojClone &AlnprojSplit &AlnprojSplit3 &AlnprojJoin &AlnprojEntryDel
  &AlnprojExtensHide &AlnprojExtensValid &AlnprojConsUpdate
  );

# package-wide constants and variables
our %LibGlob;
my %_LibParam;


################################################################################
# basics
################################################################################


# alignment project data structure
#
# %Project with keys:
#   Path       path of project file
#   FileMemb   list of projects joining the project file
#   DocAnte    document part preceding project
#   DocPost    document part following project
#   name       name of project
#   changes    flag, that changes have been made to the project
#   Param      project parameters
#   align      reference to alignment data structure (cf. basic structure
#              definition in SeqLab::SeqFormat.pm) which is an array of
#              hashes %Entry, see description below.
#   consens    reference to consensus sequence string in @Align
#
# %Entry
#   alignment data structure is much the same like in SeqLab::Align.pm,
#   but with some expansions. There are keys as follows:
#   id         pure sequence identifier (gid preferred)
#   idcomplex  complex sequence identifier string containing: id, offset in
#              alignment, strandedness in alignment
#   offset     sequence stretch position in the original sequence. For
#              relative strandedness 'R' the position value is the lower
#              one, means: Take sequence stretch from offset to
#              offset+length-1, reverse-complement it and that's it!
#              Offset values of the sequence entries should refer to the
#              validated part of the sequence stretch (big letters).
#   strand     strandedness of the sequence stretch in the alignment ('F':
#              forward, 'R': reverse)
#   sequence   sequence string. Upper/lower case of characters is quite
#              meaningful! (cmp. field 'offset').
#


# current package version
# this is most important for interpretation of data syntaxes
#
our $VERSION =
$LibGlob{Version} = '1.001';


# project parameter defaults:
#
$_LibParam{ParamDflt} = {
  BlastEntry => {
    nucleotide => {
      Database          => $BlastDefault{DbHome} .'/'. $BlastDefault{DbDictyAll},
      ParameterSet      => 'SimCluster',
      Program           => $BlastDefault{ProgStdNt},
      ThresholdIdentity => 0.90,
      },
    protein => {
      Database          => $BlastDefault{DbHome} .'/'. $BlastDefault{DbDictyProt},
      ParameterSet      => 'Default',
      Program           => $BlastDefault{ProgStdProt},
      ThresholdIdentity => 0.85,
      },
    },
  Report =>  {
    SymbolAnalysis => {
      MaskGapEnd    => 1,
      MaskGapInside => 150,
      },
    },
  SequenceReference => {
    Type    => 'ExperIndex',
    Path    => $ReadWatch::ReadIndex::LibGlob{default}{index},
    BaseDir => $ReadWatch::ReadIndex::LibGlob{default}{base}{exper},
    },
  };
$_LibParam{ParamDflt}{BlastEntry}{DNA} = $_LibParam{ParamDflt}{BlastEntry}{RNA} = $_LibParam{ParamDflt}{BlastEntry}{nucleotide};


# project file format - string definitions
#
$_LibParam{PhraseProjOff} = 'AlnK Alignment Project';
$_LibParam{PhraseProjEnd} = '_____AlnK_Project_End_____';


################################################################################
# sequence reference I/O
################################################################################


# global container of sequence originals
#
our %SeqOrig = ();
my $_poSeqIndex = undef;


# load sequence strings from original source into global storage
#
# INTERFACE
# - argument 1: reference to project data
#
# - options:
#   -debug      [STD]
#
sub _SeqOrigPreload {
  my ($pProj,%opt) = @_;
  my $debug = $main::ProgOpt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;
  my @SeqRef = @{$$pProj{param}{SequenceReference}};
  unless (int(@SeqRef) and %{$SeqRef[0]}) { return }

  my ($pSeqRef, %MatchID, @path, $pSeq, @seq);

  # find starting point in sequence array, skip consensus
  my $CtI=0;
  if ($$pProj{align}[0]{id} =~ m/$reAlnConsens/io) { $CtI ++ }
  if (!@{$$pProj{align}} or $#{$$pProj{align}}<$CtI) {
    $debug and printf STDERR "%s. WARNING: no sequence entries in alignment, nothing to do here\n", &MySub;
    return;
  }

  # work out ID array selector
  %MatchID = map { ($_->{id}=>1) } @{$$pProj{align}};
  if (keys %MatchID) {
    $debug and printf STDERR "%s. using ID matching selector:\n  %s\n", &MySub,
      join (' ', keys %MatchID);
  } else {
    printf STDERR "%s. CODE ERROR: no resulting identifier selector:\n", &MySub;
    return;
  }

  # foreach sequence reference: reduce number of missing sequences
  while (keys(%MatchID) and ($pSeqRef=shift(@SeqRef))) {

    # debug: sequence reference
    if ($debug) {
      printf STDERR "%s. data source parameters:\n", &MySub;
      printf STDERR "  source type:  %s\n", $$pSeqRef{Type}||"''";
      printf STDERR "  source path:  %s\n", $$pSeqRef{Path}||"''";
      if ($$pSeqRef{Type} eq 'ExperIndex') {
      printf STDERR "  base path:    %s\n", $$pSeqRef{BaseDir}||"''";
      }
    }

    # read sequence array from fastA format source
    if ($$pSeqRef{Type} eq 'fastA') {
      my $paSeqAdd = &SeqarrFromFFmt ($$pSeqRef{Path},
        -format  => 'fastA',
        -MatchID => \%MatchID,
        -pure    => 1,
        -debug   => $dbg2);
      # register what we've added now
      foreach (@$paSeqAdd) {
        delete $MatchID{$_->{id}};
      }
      # add sequences
      push @seq, @$paSeqAdd;
    }

    # read sequence array from Experiment format source (directory)
    elsif ($$pSeqRef{Type} eq 'ExperDir') {
      $$pSeqRef{Path} =~ s|/*$|/|;
      foreach (keys %MatchID) {

        # read sequence from Experiment file
        if ($pSeq = &{$SeqFFmtGet{Experiment}{FuncEntry}} ($$pSeqRef{Path} . $_,
          -ClipQuality=>1, -ClipUnk=>undef, -pure=>undef, -debug=>$dbg2)) {
          # register what we've added now
          delete $MatchID{$$pSeq{id}};

          # purifying sequence string is fastest this way (after &{$SeqFFmtGet{Experiment}{FuncEntry}})
          # also change to upper case
          $$pSeq{sequence} = &SeqStrPure ($$pSeq{sequence}, -upper=>1);
          push @seq, $pSeq;
        }
      }
    }

    # read sequence array from Experiment format source (index)
    elsif ($$pSeqRef{Type} eq 'ExperIndex') {

      # get array of file paths from read index
      $_poSeqIndex = ReadWatch::ReadIndex->new($$pSeqRef{Path});
      @path = $_poSeqIndex->Retrieve ('ExperSub', keys %MatchID);
      foreach (@path) {

        # read sequence from Experiment file
        $debug and printf STDERR "%s. loading Experiment file %s\n", &MySub, $$pSeqRef{BaseDir}.$_||"''";
        $pSeq = &{$SeqFFmtGet{Experiment}{FuncEntry}} ($$pSeqRef{BaseDir} . $_,
          -ClipQuality=>1, -ClipUnk=>1, -pure=>undef, -debug=>$dbg2);
        delete $MatchID{$$pSeq{id}};

        # purifying sequence string is fastest this way (after &{$SeqFFmtGet{Experiment}{FuncEntry}})
        # also change to upper case
        $$pSeq{sequence} = &SeqStrPure ($$pSeq{sequence}, -upper=>1);
        push @seq, $pSeq;
      }
    }

    elsif ($debug) {
      printf STDERR "%s. ERROR: unknown format type of reference database: %s\n", &MySub, $$pSeqRef{Type}||"''";
    }

    # debug: preload result
    if ($debug) {
      if (@seq) {
        printf STDERR "%s. result of preload cycle:\n", &MySub;
        printf STDERR "  got original sequences:     %d\n", int(@seq);
        printf STDERR "  missing original sequences: %d\n", int(keys %MatchID);
        (keys %MatchID) and
        printf STDERR "    %s\n", join(' ',keys(%MatchID));
      } else {
        printf STDERR "%s. no original sequences preloaded\n", &MySub;
      }
    }

  }  # while keys %MatchID

  # store sequence array in global storage
  foreach (@seq) {
    if ($debug and int(@seq)<50) {
      printf STDERR "%s. preloaded original sequence for ID %s, length %d\n", &MySub,
        $_->{id}||"''", length($_->{sequence});
    }
    $SeqOrig{$_->{id}} = $_->{sequence};
  }
}


# return sequence from original source
#
# INTERFACE
# - argument 1: reference to project data structure
# - argument 2: sequence identifier
#
# - options:
#   -debug      [STD]
#
# - return val: - reference to sequence data structure
#               - undef if an error occurred
#
# DESCRIPTION
# - The sequence is either retrieved from the global hash %SeqOrig or it's
#   retrieved from the original data source(s) (specified by entry in
#   @{$$pProj{param}{SequenceReference}}).
#
sub SeqOriginal {
  my ($pProj, $SeqID, %opt) = @_;
  my ($debug, $dbg2, @SeqRef);
  my ($pSeqRef, $SeqPath, $pSeq);

  # function parameters
  # source defaults are assigned in &_AlnprojParamRead
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  @SeqRef = @{$$pProj{param}{SequenceReference}};

  # assure sequences are present in global storage (=> &_SeqOrigPreload)
  # get sequence from global storage
  if (!%SeqOrig and int(@SeqRef) and %{$SeqRef[0]}) {
    $debug and printf STDERR "%s. global sequence storage empty. Doing primary preload\n", &MySub;
    &_SeqOrigPreload ($pProj, -debug=>$dbg2);
  }
  $debug and printf STDERR "%s. %d entries in global sequence storage now\n", &MySub, int(keys %SeqOrig);
  if (defined $SeqOrig{$SeqID}) {
    $debug and printf STDERR "%s. returning sequence %s, length %d\n", &MySub, $SeqID||"''", length($SeqOrig{$SeqID});
    return {
      id       => $SeqID,
      sequence => $SeqOrig{$SeqID},
      };
  }
  $debug and printf STDERR "%s. didn't find sequence %s in global storage\n", &MySub, $SeqID||"''";

  # get sequence individually from master source(s)
  while (!$pSeq and ($pSeqRef=shift(@SeqRef))) {

    # read sequence array from fastA format source
    if (exists($$pSeqRef{Type}) and $$pSeqRef{Type} eq 'fastA') {
      $pSeq = &SeqentryPopFasta ($$pSeqRef{Path}, -MatchID=>[$SeqID], -pure=>1, -debug=>$dbg2);
    }

    # read sequence array from Experiment format source (directory)
    elsif ($$pSeqRef{Type} eq 'ExperDir') {
      $$pSeqRef{Path} =~ s|/*$|/|;

      # read sequence from Experiment file
      $pSeq = &{$SeqFFmtGet{Experiment}{FuncEntry}} ($$pSeqRef{Path} . $SeqID,
        -ClipQuality=>1, -ClipUnk=>undef, -pure=>undef, -debug=>$dbg2);

      # purifying sequence string is fastest this way (after &{$SeqFFmtGet{Experiment}{FuncEntry}})
      # also change to upper case
      $$pSeq{sequence} = &SeqStrPure ($$pSeq{sequence}, -upper=>1);
    }

    # read sequence array from Experiment format source (index)
    elsif ($$pSeqRef{Type} eq 'ExperIndex') {

      # get file path from read index
      $_poSeqIndex ||= ReadWatch::ReadIndex->new($$pSeqRef{Path});
      if ($SeqPath = ( $_poSeqIndex->Retrieve ('ExperSub', $SeqID) )[0] ) {

        # read sequence from Experiment file
        $pSeq = &{$SeqFFmtGet{Experiment}{FuncEntry}} ($$pSeqRef{BaseDir} . $SeqPath,
          -ClipQuality=>1, -ClipUnk=>1, -pure=>undef, -debug=>$dbg2);

        # purifying sequence string is fastest this way (after &{$SeqFFmtGet{Experiment}{FuncEntry}})
        # also change to upper case
        $$pSeq{sequence} = &SeqStrPure ($$pSeq{sequence}, -upper=>1);
      }
    }

    # unknown reference type
    else {
      printf STDERR "%s. ERROR: unknown format type of sequence reference: %s\n", &MySub, $$pSeqRef{Type}||"''";
    }
  }

  # no database entry?
  # unretrievable sequences are entered in the storage with sequence
  #   string being ''
  unless ($$pSeq{sequence}) {
    $debug and printf STDERR "%s. no database entry for ID %s\n", &MySub, $SeqID||"''";
    $SeqOrig{$SeqID} = '';
    return undef;
  }

  # store sequence, return
  $SeqOrig{$SeqID} = $$pSeq{sequence};
  return $pSeq;
}


# check consistency of alignment entries compared to original sequences
#
# INTERFACE
# - argument 1: project reference
#
# - options:
#   -debug      [STD]
#   -dialog     enable dialogue for user interaction
#
# - return val: list of erroneous entries (IDs)
#
# DEBUG, CHANGES, ADDITIONS
# - syntax for sequence orientation is rather old-fashioned.
#
sub _SeqOrigCheck {
  my ($pProj, %opt) = @_;
  my ($debug, $dbg2);
  my ($pSeq, $pSeqOrig, @RemoveAlnEntry, $SeqStretch);
  my ($bClip1, $CtI);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # check, loop over alignment entries
  foreach $pSeq (@{$$pProj{align}}) {

    # consensus sequence is not checked
    if ($$pSeq{id} eq 'consensus') { next }

    # get master sequence
    $debug and printf STDERR "%s. retrieving original sequence for entry $$pSeq{id}\n", &MySub;
    $pSeqOrig = &SeqOriginal ($pProj, $$pSeq{id}, -debug=>$dbg2);

    # no database entry? => remove entry?
    unless ($$pSeqOrig{sequence}) {
      unless ($main::ProgOpt{-script}) {
        print "missing original database sequence for alignment entry $$pSeq{id}\n";
        if ($opt{-dialog}) {
          print "  remove sequence entry?";
          if (&QueryConfirm()) {
            push @RemoveAlnEntry, $$pSeq{idcomplex};
          }
        }
      }
      next;
    }

    # change of ID?
    if ($$pSeqOrig{id} and $$pSeqOrig{id} ne $$pSeq{id}) {
      print "identifier changed from $$pSeq{id} to $$pSeqOrig{id}\n";
      $$pSeq{id} = $$pSeqOrig{id};
      $$pProj{changes} = 1;
    }

    # check / update sequence offset and strandedness of pure sequence
    $SeqStretch = ($$pSeq{orient} eq 'F') ? $$pSeq{sequence} : &SeqStrRevcompl ($$pSeq{sequence});
    $SeqStretch =~ s/[a-z]/-/g;
    $SeqStretch = &SeqStrPure ($SeqStretch, -upper=>1);
    if (($CtI = index ($$pSeqOrig{sequence}, $SeqStretch) + 1) != $$pSeq{offset}) {
      unless ($CtI) {
        $SeqStretch = &SeqStrRevcompl ($SeqStretch);
        unless ($CtI = index($$pSeqOrig{sequence},$SeqStretch)+1) {

          # error => try to repair
          if ($bClip1) {
            print "performing stretch repair for entry $$pSeq{id}\n";
            if ($SeqStretch = &SeqStretchRepair (\$$pSeqOrig{sequence}, \$$pSeq{sequence}, -debug=>$dbg2)) {
              $$pSeq{sequence} = $SeqStretch;
              $$pProj{changes} = 1;
            } else {
              print "entry $$pSeq{id}: non-reparable differences between sequence and original sequence\n";
              $debug and printf STDERR "ID %s\n  entry    %s\n  original %s\n",
                $$pSeq{id}, $SeqStretch, $SeqOrig{$$pSeq{id}};
              if ($opt{-dialog}) {
                print "  remove sequence entry?";
                if (&QueryConfirm()) {
                  push @RemoveAlnEntry, $$pSeq{idcomplex};
                }
              }
            }
            undef $bClip1;
            next;

          # overcome former errors in the sequence database (clipped ends)
          } else {
            printf "%s. applying 1-bp-clip rule to entry %s\n", &MySub, $$pSeq{id};
            $$pSeq{sequence} =~ s/^(-*)[A-Z](.*)[A-Z]$/$1-$2/;
            $bClip1 = 1;
            redo;
          }

        # correct strandedness and positioning
        } else {
          unless ($main::ProgOpt{-script}) {
            printf "%s %s: strandedness corrected to %s, offset %d\n",
              $$pSeq{id}, ($$pSeq{orient} eq 'F') ? 'fwd' : 'rev',
              $$pSeq{orient}||"''", $CtI;
          }
          $$pSeq{orient} = ($$pSeq{orient} eq 'F') ? 'R' : 'F';
          $$pSeq{offset} = $CtI;
          $$pProj{changes} = 1;
        }

      # correct positioning
      } else {
        unless ($main::ProgOpt{-script}) {
          printf "%s %s: offset corrected from %d to %d\n",
            $$pSeq{id}, ($$pSeq{orient} eq 'F') ? 'fwd' : 'rev',
            $$pSeq{offset}, $CtI;
        }
        $$pSeq{offset} = $CtI;
        $$pProj{changes} = 1;
      }
    }
    undef $bClip1;
  }

  # return erroneous entries (IDs)
  return (@RemoveAlnEntry);
}


# return sequence extension from original source
#
# INTERFACE
# - argument 1: reference to sequence entry
# - argument 2: end specification
#               -1   left end
#               >=0  right end
#
# - options:
#   -debug      [STD]
#
# - return val: - sequence extension string or
#               - undef if an error occurred
#
# DEBUG, CHANGES, ADDITIONS
# - have a discussion about counting in computational or biological system.
#
sub SeqExtend {
  my ($pProj, $pSeqEntry, $end, %opt) = @_;
  my ($debug, $dbg2);
  my ($pSeqOrig, $sSeqOrig, $LenOrig, $LenSeq, $StrExtens);

  # function parameters
  $debug = $opt{-debug};  # || ($$pSeqEntry{id} =~ m/JC2e17c06/);
  $dbg2  = $debug ? $debug-1 : undef;
  if ($debug) {
    printf STDERR "%s. primary parameters:\n", &MySub;
    print  STDERR "  ID: $$pSeqEntry{id}\n";
    printf STDERR "  strand: $$pSeqEntry{orient}\n";
    printf STDERR "  stretch end to extend: %s\n", ($end < 0) ? 'left' : 'right';
    printf STDERR "  stretch offset: $$pSeqEntry{offset}\n";
  }

  # calculate current stretch length
  if ($$pSeqEntry{sequence} =~ m/([A-Z]-*)+/) {
    $LenSeq = length (&SeqStrPure ($&)) or return undef;
  } else {
    return undef;
  }

  # get original sequence
  $pSeqOrig = &SeqOriginal ($pProj, $$pSeqEntry{id}, -debug=>$dbg2);
  unless ($sSeqOrig = $$pSeqOrig{sequence}) {
    return undef;
  }

  # work out left extension string
  # $$pSeqEntry{offset} should contain a correct value
  if ($end < 0) {
    if ($$pSeqEntry{orient} eq 'F') {
      $StrExtens = substr ($sSeqOrig, 0, $$pSeqEntry{offset}-1);
    } else {
      if (($LenOrig=length($sSeqOrig)) > $$pSeqEntry{offset}-1+$LenSeq) {
        $StrExtens = substr ($sSeqOrig, $$pSeqEntry{offset}-1+$LenSeq, $LenOrig);
        $StrExtens = &SeqStrRevcompl ($StrExtens);
      }
    }
  }

  # work out right extension string
  # $$pSeqEntry{offset} should contain a correct value
  else {
    if ($$pSeqEntry{orient} eq 'F') {
      if (($LenOrig=length($sSeqOrig)) > $$pSeqEntry{offset}-1+$LenSeq) {
        $StrExtens = substr ($sSeqOrig, $$pSeqEntry{offset}-1+$LenSeq, $LenOrig);
      }
    } else {
      $StrExtens = substr ($sSeqOrig, 0, $$pSeqEntry{offset}-1);
      $StrExtens = &SeqStrRevcompl ($StrExtens);
    }
  }

  # debug
  if ($debug) {
    printf STDERR "%s. Secondary parameters:\n", &MySub;
    printf STDERR "  stretch length: %d\n", $LenSeq;
    print  STDERR "  original length: $LenOrig\n";
    printf STDERR "  extension length: %d\n", length $StrExtens;
    printf STDERR "  entry sequence: %s\n", $$pSeqEntry{sequence};
    printf STDERR "  original sequence: %s\n", $sSeqOrig;
  }

  # exit SUB
  return $StrExtens;
}


# clear global container of sequence originals
#
# INTERFACE
# *NONE*
#
sub SeqOrigClear {
  %SeqOrig  = ();
  $_poSeqIndex = undef;
}


################################################################################
# project I/O, meta-handling
################################################################################


# open project from any supported input format
#
# INTERFACE
# - argument 1: path of source file
# - argument 2: project name, see &AlnprojClustalOpen for details
#
# - options:
#   -debug      [STD]
#   -dialog     enable dialogue for user interaction
#   -noDB       suppress dialogue with the database. Sequences won't be
#               checked for integrity, offset values aren't updated.
#   -timer      [STD], option is handed over to subroutines.
#
# - return val: - reference to project data
#               - undef if an error occurred.
#
# DESCRIPTION
# - check referenced input file for following contents (in this order):
#   - GAP directed assembly
#   - alignment project file or Clustal W alignment (array)
# - if a project name is not defined its queried from the user
#   (if allowed, cmp. option -dialog)
#
sub AlnprojGet {
  my ($PathIn, $NameProj, %opt) = @_;
  my ($debug, $dbg2);
  my ($PlainDoc);
  my ($paProj, $NameUndef, $pProj);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # read source file
  $PathIn = &PathExpand ($PathIn);
  unless ($PlainDoc = &ReadFile($PathIn)) {
    ($opt{-dialog} or $debug) and printf "%s. ERROR: unable to read input file %s\n", &MySub, $PathIn||"''";
    return undef;
  }

  # file of IDs? => GAP directed assembly
  # - match first non-comment line
  # - trailing "?" is needed to restrict match to one single line
  if ($PlainDoc =~ m/^[^#]+?$/m) {
    my $Fofn1stId = $&;
    my $FofnDir = (&PathSplit($PathIn)||{})->{dir};
    my $Fofn1stPath = "$FofnDir/$Fofn1stId";
    if ($debug) {
      printf STDERR "%s. test for GAP directed assembly\n", &MySub;
      printf STDERR "  primary input file: %s\n", $PathIn||"''";
      printf STDERR "  directory: %s\n", $FofnDir||"''";
      printf STDERR "  testing first ID: %s\n", $Fofn1stId||"''";
      printf STDERR "  resulting file: %s\n", $Fofn1stPath||"''";
    }
    if (-r($Fofn1stPath) and &SeqFileFormat($Fofn1stPath) eq 'Experiment') {
      $debug and print  STDERR "  is directed assembly: YES\n";
      $pProj = &AlnprojGapdirOpen ($FofnDir,
        &PlainToTable(\$PlainDoc,-TabType=>'A1',-wantscalar=>1,-debug=>$dbg2), %opt);
      return $pProj;
    } else {
      $debug and print  STDERR "  is directed assembly: NO\n";
    }
  } # don't continue with "elsif"

  # true alignment project or Clustal W alignment
  if ($PlainDoc =~ m/$reClustalHead\n/o) {
    $paProj = &_AlnprojClustalList (\$PlainDoc);

    # force to true project array (if dialogue is allowed)
    foreach (@$paProj) {
      unless ($_->{name}) {
        if ($opt{-dialog}) {
          &_AlnprojClustalTrue ($paProj, \$PlainDoc, -debug=>$dbg2);
          last;
        } else {
          $NameUndef = 1;
        }
      }
    }

    # define target project
    unless ($NameProj) {
      if (@$paProj > 1) {
        if ($opt{-dialog}) {
          $NameProj = &_AlnprojClustalChoose ($paProj);
        }
        else {
          $debug and printf STDERR "%s. ERROR: several projects (%d) in file, but no project specified\n", &MySub,
            int @$paProj;
          return undef;
        }
      } else {
        $NameProj = $$paProj[0]{name};
      }
    }

    # read target project
    $opt{-debug} = $dbg2;
    $pProj = &AlnprojClustalOpen ($PathIn, $NameProj,
      -PlainDoc=>$PlainDoc, -index=>$paProj, %opt);
    $$pProj{name} ||= &PathSplit ($PathIn)->{nameroot};
    return $pProj;
  }

  # unknown input type
  else {
    $debug and printf STDERR "%s. ERROR: unknown source type\n", &MySub;
    return undef;
  }
}


################################################################################
# AlnK format input
################################################################################


# project index data structure
#
# each index entry represented by a hash with following keys:
#  LenAnte  length of string preceding that project
#  name     project name
#  offset   offset position of project in project file
#  OffLine  offset line number of project in project file
#           this information is given only for no-name Clustal W
#           projects.
#  length   project string length
#  OffPost  offset of string following that project
#  changes  changes have been made to the document structure
#


# project file format - string definitions
# see on top


# make index of projects or Clustal W alignments in a project file
#
# INTERFACE
# - argument 1: reference to project plain text
#
# - options:
#   -debug      [STD]
#
# - return val: reference to list of projects (array of hashes), description
#               see above.
#
# DESCRIPTION
# - first, the function searches for true alignment projects. If there're
#   none, it looks for Clustal W alignments.
#
sub _AlnprojClustalList {
  my ($pPlainDoc, %opt) = @_;
  my $debug = $opt{-debug};

  # match to project delimiters
  my @aProj;
  while ($$pPlainDoc =~ m/$_LibParam{PhraseProjOff} \'([^\n\',]*?)\'\n(.*?)\n$_LibParam{PhraseProjEnd} \'\1\'(\n|$)/gos) {
    push @aProj, {
      LenAnte => length $`,
      name    => $1,
      offset  => length($`) + length($_LibParam{PhraseProjOff}) + 4 + length($1),
      OffLine => &MatchCt (\(substr($$pPlainDoc,0,length($`))), '\n') + 1,
      length  => length $2,
      OffPost => length($`) + length($&),
      source  => 'AlnK',
      };
    $debug and printf STDERR "%s. found project %s\n", &MySub, $1||"''";
  }

  # no projects? => extract Clustal W alignments
  # Clustal W header will be part of the indexed plain text
  unless (@aProj) {
    my @Clustal = split (/($reClustalHead\n)/om, $$pPlainDoc);
    my $CtAnte = length (shift @Clustal);
    my $ClustalBlock;
    while ($ClustalBlock = shift(@Clustal).shift(@Clustal)) {
      my ($ProjPosEnd,$ProjLenPost) = (0,0);
      while ($ClustalBlock =~ m/$reClustalEnd/og) {
        $ProjPosEnd = (length $`) + (length $&);
        $ProjLenPost = length $';
      }
      push @aProj, {
        LenAnte => $CtAnte,
        name    => '',
        offset  => $CtAnte,
        OffLine => (&MatchCt( \(substr($$pPlainDoc,0,$CtAnte)) ,'\n')||0) + 1,
        length  => $ProjPosEnd,
        OffPost => $CtAnte + $ProjPosEnd,
        source  => 'Clustal W',
        };
      $CtAnte += length($ClustalBlock);
    }
  }

  # report projects or alignments (debug)
  if ($debug) {
    if (@aProj) {
      printf STDERR "%s. found following project entries:\n", &MySub;
      foreach (@aProj) {
        print  STDERR "  project entry: project \'$_->{name}\', source $_->{source}, ",
          "ante $_->{LenAnte}, line $_->{OffLine}, pos $_->{offset}, len $_->{length}, ",
          "post pos $_->{OffPost}\n";
      }
    } else {
      print "no project entry found\n";
    }
  }

  # exit SUB
  return \@aProj;
}


# convert bunch of Clustal W alignments to a project file with true projects
#
# INTERFACE
# - argument 1: reference to project list as worked out by &_AlnprojClustalList
#               The referenced data is explicitly changed. A copy of the
#               reference is returned by the function.
# - argument 2: reference to plain text
#               The plain text is changed in that respect that the contained
#               Clustal W projects are bracketed with project delimiters.
#
# - options:
#   -debug      [STD]
#
# - return val: reference to index of projects, like argument 1. Now
#               having updated 'name' fields for each entry. The values
#               of the fields 'LenAnte', 'offset', 'OffLine' 'OffPost',
#               are updated according to the changes in the document.
#
# DESCRIPTION
# - the function is user-interactive in that it asks for project identifiers.
# - the validated projects are not saved back to the disk space.
#
sub _AlnprojClustalTrue {
  my ($paProj, $pPlainDoc, %opt) = @_;
  my ($debug);
  my ($NameProj, $pEntry, $ProjHead, $ProjTail, $LenNmb, $CtAdd, $CtI);

  # function parameters
  $debug = $opt{-debug};

  # list projects
  print "no AlnK alignment projects available, here's a listing of Clustal W projects:\n";
  $LenNmb = length (int (@$paProj));
  for ($CtI=0; $CtI<int(@$paProj); $CtI++) {
    printf "project %${LenNmb}d: Clustal W alignment, starting at line %d\n",
      $CtI+1, $$paProj[$CtI]{OffLine};
  }

  # get project names
  printf "enter a name for %sproject:\n", (@$paProj>1)?'each ':'';
  for ($CtI=0; $CtI<int(@$paProj); $CtI++) {
    printf "%${LenNmb}d> ", $CtI+1;
    chop ($NameProj = <STDIN>);
    &AlnprojName ($$paProj[$CtI], $NameProj);
  }

  # bracket projects with project delimiters, change index values
  $CtAdd = 0;
  foreach $pEntry (@$paProj) {
    $$pEntry{LenAnte} += $CtAdd;
    $ProjHead = sprintf ("%s \'%s\'\n",
      $_LibParam{PhraseProjOff},
      $$pEntry{name});
    substr ($$pPlainDoc, $$pEntry{LenAnte}, 0) = $ProjHead;
    $CtAdd += length $ProjHead;
    $$pEntry{offset}  += $CtAdd;
    $ProjTail = sprintf ("\n%s \'%s\'\n",
      $_LibParam{PhraseProjEnd},
      $$pEntry{name});
    substr ($$pPlainDoc, $$pEntry{offset}+$$pEntry{length}, 0) = $ProjTail;
    $CtAdd += length $ProjTail;
    $$pEntry{OffPost} += $CtAdd;
    $$pEntry{OffLine}  = (&MatchCt( \(substr($$pPlainDoc,0,$$pEntry{LenAnte})) ,'\n')||0) + 1;
    $$pEntry{changes}  = 1;
  }

  # debug
  if ($debug) {
    printf STDERR "%s. newly structured alignment projects:\n", &MySub;
    foreach $pEntry (@$paProj) {
      print  STDERR substr ($$pPlainDoc, $$pEntry{LenAnte}, $$pEntry{length});
      print  STDERR '+' x 60, "\n";
    }
  }

  # exit SUB
  return $paProj;
}


# user's choice for project from document
#
# INTERFACE
# - argument 1: reference to project list as worked out by &_AlnprojClustalList
# - return val: name of chosen project.
#
sub _AlnprojClustalChoose {
  my ($paProj, %opt) = @_;
  my ($NmbChoice, $NameProj, $LenNmb, $CtI);

  # only one project => choose that project, name may be given later (in
  #   course of a session)
  if (int(@$paProj) == 1) {
    return $$paProj[0]{name};
  }

  # list projects
  print "Available AlnK alignment projects:\n";
  $LenNmb = length (int (@$paProj));
  for ($CtI=0; $CtI<int(@$paProj); $CtI++) {
    printf "%${LenNmb}d. project \'%s\', starting at line %d\n",
      $CtI+1, $$paProj[$CtI]{name},
      $$paProj[$CtI]{OffLine};
  }

  # get user's choice
  $NmbChoice = int (@$paProj);
  while (! ($NameProj = $$paProj[$NmbChoice]{name})) {
    print "choose project (enter number)> ";
    chop ($NmbChoice = <STDIN>);
    if ($NmbChoice !~ m/\d+/) { redo }
    else { $NmbChoice = $& - 1; }
  }

  # exit SUB
  return $NameProj;
}


# open alignment project
#
# INTERFACE
# - argument 1: path of project file
# - argument 2: project name
#               using undef or '' as project name forces the SUB to open the
#               first anonymous project in the project file.
#
# - options:
#   -debug      [STD]
#   -dialog     enable dialogue for user interaction
#   -index      supply project index for given source file. This option
#               should be used in conjunction with option -PlainDoc.
#   -noDB       suppress dialogue with the database. Sequences won't be
#               checked for integrity, offset values aren't updated.
#   -PlainDoc   supply plain document that's already been read from file
#   -timer      [STD]
#
# - return val: - reference to project data
#               - undef if an error occurred.
#
# DESCRIPTION
# - read project data from project file or alignment(s) in Clustal W format.
# - Integrity of sequence stretches is checked against the original sequences
#   from the reference database
#   - unusual offset
#   - missing original sequence
#   - differences between aligned and original sequence
#     in case of differences the function tries to do a repair via
#     SeqStretchRepair in SeqLab::SeqAnalysis.pm.
# - for definition of alignment project data structure see SeqLab::AlnProj.pm.
#
sub AlnprojClustalOpen {
  my ($PathIn, $NameProj, %opt) = @_;
  my ($debug, $dbg2, $bTimer, $time);
  my ($PlainDoc, $paProj, $ProjPlain, $AlnprojParamPlain, $pProj);
  my ($id, $offset, $orient);
  my ($bClip1, $pSeqOrig, $SeqStretch, @RemoveAlnEntry);
  my ($CtI);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $bTimer = $opt{-timer};
  $PathIn = &PathExpand ($PathIn);

  # read source file, get index of projects
  $bTimer and $time = (times)[0];
  $PlainDoc = $opt{-PlainDoc} || &ReadFile($PathIn);
  unless ($PlainDoc) {
    printf STDERR "%s. ERROR: unable to read file %s\n", &MySub, $PathIn||"''";
    return undef;
  }
  $paProj = $opt{-index} || &_AlnprojClustalList(\$PlainDoc);
  if (!int(@$paProj) or ($NameProj and !$$paProj[0]{name})) {
    # no project at all or specified project not found
    return undef;
  }
  $bTimer and printf STDERR "%s. CPU time for file reading and indexing: %.3f s\n", &MySub, (times)[0] - $time;

  # pick project from available projects
  foreach (@$paProj) {
    if ($NameProj eq $_->{name}) {
      $pProj = {
        Path    => $PathIn,
        source  => $_->{source},
        DocAnte => substr ($PlainDoc, 0, $_->{LenAnte}),
        name    => $_->{name},
        DocPost => substr ($PlainDoc, $_->{OffPost}, length $PlainDoc),
        changes => $_->{changes},
        };
      $ProjPlain = substr ($PlainDoc, $_->{offset}, $_->{length});
      last;
    }
  }

  # split plain project to plain parameters and plain Clustal W passage
  if ($ProjPlain =~ m/$reClustalHead/o) {
    $ProjPlain  = $';
    $AlnprojParamPlain = $`;
    if ($AlnprojParamPlain =~ m/\n>.*?consensus/i) {
      $AlnprojParamPlain = $`;
    }
  } else {
    printf STDERR "%s. ERROR: missing Clustal alignment block in project plain text\n", &MySub;
    return undef;
  }

  # get sequence entries from alignment plain text
  $bTimer and $time = (times)[0];
  $$pProj{align} = &AlnClustalRead (\$ProjPlain, -KeyId=>'idcomplex', -ConsDel=>1, -timer=>$bTimer, -debug=>$dbg2);
  if ($debug) {
    printf STDERR "%s. found %d entries in Clustal W alignment\n", &MySub, int @{$$pProj{align}};
    printf STDERR "%s. first entry: %s\n", &MySub, $$pProj{align}[0]{idcomplex};
  }
  $bTimer and printf STDERR "%s. CPU time for project parsing: %.3f s\n", &MySub, (times)[0] - $time;

  # get information fields from complex identifier
  foreach (@{$$pProj{align}}) {

    # split identifier / pos+strand
    if ($_->{idcomplex} =~ m/^(.*[^_])_+/) {
      $id = $1;
      $offset = $';

      # split alignment offset / strand
      if ($offset =~ m/^(\d+)([frs]?)$/i) {
        $offset = $1;
        $orient = $2;
        unless ($orient) {
          $orient = 'F';
          $$pProj{changes} = 1;
        }
        if ($orient eq 'S') {
          $orient = 'F';
          $$pProj{changes} = 1;
        }
        $orient =~ tr/a-z/A-Z/;
      } else {
        unless ($main::ProgOpt{-script}) {
          print  STDERR "WARNING: unusual identifier $_->{idcomplex}, assigning standard values\n";
        }
        $offset = 1;
        $orient = 'F';
        $$pProj{changes} = 1;
      }
    } else {
      $id = $_->{idcomplex};
      $offset = 1;
      $orient = 'F';
      $$pProj{changes} = 1;
    }

    # enter entry
    $_->{id}     = $id;
    $_->{offset} = $offset;
    $_->{orient} = $orient;
  }

  # parse project parameters
  # sequence entries are needed to define a default sequence type
  $$pProj{param} = &_AlnprojParamRead (\$AlnprojParamPlain, $pProj, -debug=>$dbg2);

  # retrieve original sequence from the database, check integrity
  unless ($opt{-noDB}) {

    # check integrity of sequences of the alignment entries
    # &_SeqOrigPreload will be called automatically, don't call &_SeqOrigPreload explicitly
    #   here, cause we may be in the situation of re-loading an edited project. But,
    #   what if we want to add a new project for joining? We will have to check all the
    #   original sequences one by one.
    $bTimer and $time = (times)[0];
    push @RemoveAlnEntry, ( &_SeqOrigCheck ($pProj, -dialog=>$opt{-dialog}, -debug=>$dbg2) );
    $bTimer and printf STDERR "%s. CPU time for checking integrity of sequences: %.3f s\n", &MySub, (times)[0] - $time;
  }

  # remove erroneous entries (IDs in @RemoveAlnEntry)
  foreach (@RemoveAlnEntry) {
    for ($CtI=0; $CtI<@{$$pProj{align}}; $CtI++) {
      if ($$pProj{align}[$CtI]{idcomplex} eq $_) {
        $debug and printf STDERR "%s. sequence entry %s removed\n", &MySub,
          $$pProj{align}[$CtI]{idcomplex}||"''";
        splice @{$$pProj{align}}, $CtI, 1;
        last;
      }
    }
    if ($CtI >= @{$$pProj{align}} and $debug) {
      printf STDERR "%s. ERROR: didn't find sequence entry '$_' that's to be removed\n", &MySub;
    }
    $$pProj{changes} = 1;
  }

  # update consensus
  $bTimer and $time = (times)[0];
  &AlnprojConsUpdate ($pProj);

  # nice margins, delete redundant gaps
  # Changes aren't registrated by the 'Changes Monitoring'.
  # Don't use &AlnCompress here! This SUB might be used for reading working
  #   windows with entries having no effective sequence string
  &AlnMargins ($$pProj{align}, -debug=>$dbg2);
  $bTimer and printf "%s. CPU time for making nice project: %.3f s\n", &MySub, (times)[0] - $time;

  # debug protocol for result
  if ($debug) {
    printf STDERR "%s. project after processing:\n", &MySub;
    print  STDERR "  document path: $PathIn\n";
    print  STDERR "  project name: $$pProj{name}\n";
    printf STDERR "  document ante: %d byte\n", length $$pProj{DocAnte};
    printf STDERR "  document post: %d byte\n", length $$pProj{DocPost};
    printf STDERR "  entries: %d\n", int @{$$pProj{align}};
    printf STDERR "  project changed: %s\n", $$pProj{changes} ? 'yes':'no';
    printf STDERR "%s. project entries (preceded by consensus entry):\n", &MySub;
    foreach ($$pProj{consens}, @{$$pProj{align}}) {
      print  STDERR "  $_->{idcomplex} $_->{id} $_->{offset} $_->{orient}  $_->{sequence}\n";
    }
  }

  # exit SUB successfully
  return $pProj;
}


################################################################################
# AlnK format output
################################################################################


# project file format - string definitions
# see on top


# make complex identifiers in alignment data structure
#
# INTERFACE
# - argument 1: reference to alignment data structure.
#               data structure itself is explicitly changed, means update of
#               key 'idcomplex' for each sequence entry.
#
sub AlnprojIdcomplex {
  my ($pAln, %opt) = @_;
  my ($WidthId, $WidthOff);

  # get maximal field lengths
  foreach (@$pAln) {
    $WidthId  = &Max ($WidthId,  length $_->{id});
    $WidthOff = &Max ($WidthOff, length $_->{offset});
  }

  # make complex identifiers
  foreach (@$pAln) {
    $_->{idcomplex} = sprintf ("%-${WidthId}s %${WidthOff}d%s", $_->{id}, $_->{offset}, $_->{orient});
    $_->{idcomplex} =~ s/ /_/g;
  }
}


# format project alignment to Clustal W
#
# INTERFACE
# - argument 1: reference to alignment data
# - options:    all of &AlnClustalSprint with exception of '-KeyId'.
# - return val: Clustal W formatted alignment
#
# DESCRIPTION
# - This function serves as an interface to AlnClustalSprint. The only thing to
#   be done here is the up-to-date construction of field 'idcomplex' for all the
#   alignment entries. Then, &AlnClustalSprint has to be told that 'idcomplex' is
#   the identifier field to be used.
#
sub AlnprojClustalSprint {
  my ($pAln, %opt) = @_;

  # update complex identifiers
  &AlnprojIdcomplex ($pAln);
  $opt{-KeyId} = 'idcomplex';

  # return output
  return &AlnClustalSprint ($pAln, %opt);
}


# print alignment project to plain string
#
# INTERFACE
# - argument 1: reference to project data
#               the project is changed in that respect that the field
#               'idcomplex' is updated for all alignment entries.
#
# - options:
#   -debug      print debug protocol to STDOUT
#
# - return val: project plain text
#
sub AlnprojSprint {
  my ($pProj, %opt) = @_;
  my ($debug, $dbg2, $ProjPlain);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # output project header
  $ProjPlain .= sprintf "%s \'%s\'\n", $_LibParam{PhraseProjOff}, $$pProj{name};
  $ProjPlain .= &AlnprojParamSprint ($$pProj{param}, -debug=>$dbg2);
  $ProjPlain .= "\n";

  # output consensus sequence
  $ProjPlain .= &SeqentryToFasta (
    { sequence=>$$pProj{consens}{sequence}, id=>$$pProj{name} },
    -phrase => 'consensus sequence',
    -pure   => 1,
    -debug  => $dbg2);
  $ProjPlain .= "\n";

  # output alignment
  $ProjPlain .= &AlnprojClustalSprint ($$pProj{align},
    '-1block'  => 1,
    -ClipEnd => 1,
    -debug   => $dbg2);

  # output project trailer
  $ProjPlain .= sprintf "%s \'%s\'\n", $_LibParam{PhraseProjEnd}, $$pProj{name};

  # exit SUB
  return $ProjPlain;
}


# output alignment project to file
#
# INTERFACE
# - argument 1: reference to project data
#               the project is changed in that respect that the field
#               'idcomplex' is updated for all alignment entries.
# - argument 2: target file for output (default target: STDOUT)
# - return val: success status (boolean)
#
sub AlnprojToFile {
  my ($pProj, $PathOutput, %opt) = @_;

  # open output file
  $PathOutput = &PathExpand ($PathOutput);
  unless (open (PROJFILE, ">$PathOutput")) {
    print  STDERR "ERROR: unable to open file $opt{-wfile} for output\n";
    return undef;  # return with error
  }

  # output project file
  print PROJFILE $$pProj{DocAnte};
  print PROJFILE &AlnprojSprint ($pProj, %opt);
  print PROJFILE $$pProj{DocPost};

  # close output file, exit SUB successfully
  close PROJFILE;
  return 1;
}


################################################################################
# project parameter I/O
################################################################################


# project parameter defaults:
# see on top


# read project parameters
#
# INTERFACE
# - argument 1: reference to plain text of parameter data
# - argument 2: reference to project data
#
# - options:
#   -debug      [STD]
#
# - return val: reference to project parameter data
#
sub _AlnprojParamRead {
  my ($pPlainParam, $pProj, %opt) = @_;
  my ($debug, $dbg2);
  my ($pAlnprojParam, $AlnSeqType);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  $debug and printf STDERR "%s. got following parameter plain text:\n$$pPlainParam", &MySub;

  # parse parameter data from plain text
  $pAlnprojParam = &DataRead ($pPlainParam, -debug=>$dbg2);

  # update syntax of former versions
  if ($$pAlnprojParam{ReferenceDatabaseType} or $$pAlnprojParam{ReferenceDatabasePath}) {
    $$pAlnprojParam{SequenceReference} = [{
      Type => $$pAlnprojParam{ReferenceDatabaseType},
      Path => $$pAlnprojParam{ReferenceDatabasePath},
      }];
    delete $$pAlnprojParam{ReferenceDatabaseType};
    delete $$pAlnprojParam{ReferenceDatabasePath};
    $$pProj{changes} = 1;
  }
  if (($$pAlnprojParam{SequenceReference}[0]{Path}||'') eq "$BlastDefault{DbHome}/dicty.reads") {
    $$pAlnprojParam{SequenceReference}[0] = $_LibParam{ParamDflt}{SequenceReference};
    $$pProj{changes} = 1;
  }
  if (($$pAlnprojParam{SequenceReference}[0]{Type}||'') eq 'Experiment') {
    $$pAlnprojParam{SequenceReference}[0]{Type} = 'ExperDir';
    $$pProj{changes} = 1;
  }

  # diagnose seq type
  unless ($$pAlnprojParam{SeqType}) {
  # this has to be done before {BlastEntry}{Program}
    $AlnSeqType = $$pAlnprojParam{SeqType} =
      &SeqType ($$pProj{align}[0]{sequence}, -debug=>$dbg2);
    # there's no consensus sequence yet, but there should be any sequence entry
    $$pProj{changes} = 1;
  }

  # fill undefined parameters with default values
  unless ($$pAlnprojParam{BlastEntry}) {
    $$pAlnprojParam{BlastEntry} = &DataClone ($_LibParam{ParamDflt}{BlastEntry}{$AlnSeqType});
    $$pProj{changes} = 1;
  }
  unless ($$pAlnprojParam{BlastEntry}{Program}) {
    # BLAST database is assumed to be the same sequence type as the alignment
    #   project
    $$pAlnprojParam{BlastEntry}{Program} = $_LibParam{ParamDflt}{BlastEntry}{$AlnSeqType}{Program};
    $$pProj{changes} = 1;
  }
  unless ($$pAlnprojParam{BlastEntry}{Database}) {
    $$pAlnprojParam{BlastEntry}{Database} = $_LibParam{ParamDflt}{BlastEntry}{$AlnSeqType}{Database};
    $$pProj{changes} = 1;
  }
  if ($$pAlnprojParam{BlastEntry}{ThresholdScore}) {
    $$pAlnprojParam{BlastEntry}{ThresholdExpect} and $$pProj{changes} = 1;
    delete $$pAlnprojParam{BlastEntry}{ThresholdExpect};
  }
  unless ($$pAlnprojParam{BlastEntry}{ThresholdIdentity}) {
    $$pAlnprojParam{BlastEntry}{ThresholdIdentity} = $_LibParam{ParamDflt}{$AlnSeqType}{ThresholdIdentity};
    $$pProj{changes} = 1;
  }
  unless ($$pAlnprojParam{Report}{SymbolAnalysis}) {
    $$pAlnprojParam{Report}{SymbolAnalysis} = &DataClone ($_LibParam{ParamDflt}{Report}{SymbolAnalysis});
    $$pProj{changes} = 1;
  }
  unless (defined $$pAlnprojParam{Report}{SymbolAnalysis}{MaskGapEnd}) {
    $$pAlnprojParam{Report}{SymbolAnalysis}{MaskGapEnd} = $_LibParam{ParamDflt}{Report}{SymbolAnalysis}{MaskGapEnd};
    $$pProj{changes} = 1;
  }
  unless (defined $$pAlnprojParam{Report}{SymbolAnalysis}{MaskGapInside}) {
    $$pAlnprojParam{Report}{SymbolAnalysis}{MaskGapInside} = $_LibParam{ParamDflt}{Report}{SymbolAnalysis}{MaskGapInside};
    $$pProj{changes} = 1;
  }
  unless (defined $$pAlnprojParam{SequenceReference}) {
    $$pAlnprojParam{SequenceReference} = [ $_LibParam{ParamDflt}{SequenceReference} ];
    $$pProj{changes} = 1;
  }

  # debug
  $debug and print  STDERR &_AlnprojParamDebug ($pAlnprojParam);

  # exit SUB
  return $pAlnprojParam;
}


# debug report for project parameters
#
# INTERFACE
# - argument 1: reference to parameter data
# - return val: plain text representation of project parameter data
#
sub _AlnprojParamDebug {
  my ($pParam, %opt) = @_;
  my ($PlainParam);

  # print current parameters
  $PlainParam .= "_AlnprojParamDebug. debug report:\n";
  $PlainParam .= sprintf "file version: %s\n", $$pParam{Version}||"''";
  $PlainParam .= sprintf "latest touch: %s\n", $$pParam{LatestTouch};

  # rewrite of INI data
  $PlainParam .= "_AlnprojParamDebug. rewrite of INI data:\n";
  $PlainParam .= &AlnprojParamSprint ($pParam, -debug=>($opt{-debug}?$opt{-debug}-1:undef));

  # exit SUB
  return $PlainParam;
}


# read project parameters
#
# INTERFACE
# - argument 1: reference to plain text of parameter data
#
# - options:
#   -debug      [STD]
#
# - return val: plain text representation of project parameter data
#
sub AlnprojParamSprint {
  require MemHandle;
  my ($pAlnprojParam, %opt) = @_;
  my ($debug);

  # function parameters
  $debug = $opt{-debug};

  # update of parameter data
  $$pAlnprojParam{LatestTouch} = &TimeStr();
  $$pAlnprojParam{Version}= $_LibParam{Version};
  my $hMem = MemHandle->new();
  &DataPrint ($pAlnprojParam, -handle=>$hMem, -debug=>$debug?$debug-1:undef);

  # print and return plain text
  return $hMem->mem();
}


# (get and) enter project name
#
# INTERFACE
# - argument 1: reference to project data
# - argument 2: optional new name. If no name is given, it's got by a user
#               prompt.
#
# DESCRIPTION
# - the given project name is stored in $$pProj{name}.
#
sub AlnprojName {
  my ($pProj, $NameProj) = @_;

  # get string by command line
  while (! $NameProj) {
    print "please, enter project name\n> ";
    chop ($NameProj = <STDIN>);
    $NameProj =~ tr/^',$ //d;  #'
  }

  # clean and enter name string
  if ($$pProj{name} ne $NameProj) {
    $$pProj{name} = $NameProj;
    $$pProj{changes} = 1;
  }
}


################################################################################
# GAP4 directed assembly I/O
################################################################################


# read alignment project from GAP4 directed assembly
#
# INTERFACE
# - argument 1: path of source file
# - argument 2: reference to array of sequence identifiers (= files)
#
# - options:
#   -debug      [STD]
#   -dialog     enable dialogue for user interaction
#   -noDB       suppress dialogue with the database. Sequences won't be
#               checked for integrity, offset values aren't updated.
#   -timer      [STD], option is handed over to subroutines.
#
# - return val: - reference to project data
#               - undef if an error occurred.
#
# DESCRIPTION
# - check referenced input file for following contents (in this order):
#   - GAP directed assembly
#   - alignment project file or Clustal W alignment (array)
#
# DEBUG, CHANGES, ADDITIONS
# - stretches are not correctly mapped until comparison with originals
#
sub AlnprojGapdirOpen {
  my ($PathDir,$pId,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  my @PathExper;
  foreach (@$pId) {
    push @PathExper, "$PathDir/$_";
  }

  # read sequence files to array
  # - don't apply -pure during primary reading of the data
  #   => the gaps are needed for the accurate alignment
  # - apply -ClipQuality during primary reading of the data
  #   The alignment cross-sequence position references refer to the clipped
  #   sequence string.
  my $poSeqQueue = SeqLab::SeqStreamIn->new(@PathExper);
  $poSeqQueue->AddSwitch (-pure=>undef, -ClipQuality=>1, -debug=>$dbg2);
  my $SeqLast=''; my %SeqWait;
  my $pProj={align=>[]};
  my ($pSeq,$CtMove);
  while ($pSeq = ($SeqWait{$SeqLast}||$poSeqQueue->GetNext())) {
    $debug and printf STDERR "%s. sequence $$pSeq{id}\n", &MySub;

    # parse alignment reference for current entry
    # eventually move entry to queue of sequences waiting for precedents
    my ($reference,$strand,$position) = split (/ +/,$$pSeq{AP});
    if ($reference ne '*new*' and $reference ne $SeqLast) {
      $SeqWait{$reference} = $pSeq;
      next;
    }

    # enter alignment entry
    # place sequence string in global storage (pure, upper case)
    $SeqOrig{$$pSeq{id}} = &SeqStrPure($$pSeq{sequence},-upper=>1);
    &AnnotClip($pSeq,-debug=>$dbg2);
    $$pSeq{sequence} =~ tr/a-z/A-Z/;

    # strandedness in alignment
    # stretches are not correctly mapped until comparison with originals
    if ($strand eq '-') {
      $$pSeq{sequence} = &SeqStrRevcompl($$pSeq{sequence});
      $$pSeq{orient} = 'R';
    } else {
      $$pSeq{sequence} = $$pSeq{sequence};
      $$pSeq{orient} = 'F';
    }

    # position in alignment
    # enter gaps at the beginning of the sequence string
    $CtMove += $position||0;
    substr($$pSeq{sequence},0,0) = '-' x $CtMove;

    # offset
    # stretches are not correctly mapped until comparison with originals
    $$pSeq{offset} = 1;

    # enter sequence, change reference for waiting queue
    push @{$$pProj{align}}, $pSeq;
    if($SeqLast){ delete $SeqWait{$SeqLast} }
    $SeqLast = $$pSeq{id};
  }

  # project parameters:
  # - defaults from &_AlnprojParamRead
  # - point 'SequenceReference' to project directory
  $$pProj{param} = &_AlnprojParamRead (\'', $pProj, -debug=>$dbg2);
  $$pProj{param}{SequenceReference} = [{
    Type  => 'ExperDir',
    Path  => $PathDir,
    }];

  # remap positions, check integrity
  &_SeqOrigCheck ($pProj, -dialog=>$opt{-dialog}, -debug=>$dbg2);

  # tidy up, exit SUB
  &AlnprojConsUpdate ($pProj);
  return $pProj;
}


################################################################################
# project operations
################################################################################


# return a clone of a project
#
# INTERFACE
# - argument 1: reference to project data
#
# - options:
#   -debug      [STD]
#
# - return val: reference to project data clone
#
# DESCRIPTION
# - this has very much the same effect as if we would copy with &DataClone.
#   No data selection of refinement will be done.
#
sub AlnprojClone {
  my ($pProjMother, %opt) = @_;
  my ($pProjChild);

  # copy base hash to new structure
  # alignment data structure is cloned by AlnClone
  %{$pProjChild} = %{$pProjMother};
  $$pProjChild{align} = &AlnClone ($$pProjMother{align});

  # renew consensus pointer
  $$pProjChild{consens} = $$pProjChild{align}[0];

  return $pProjChild;
}


# split project into two parts
#
# INTERFACE
# - argument 1: reference to project data
#               the referenced data is left unchanged
# - argument 2: split position, i.e. length of the 5' part
#
# - options:
#   -consens    the split position refers to number of consensus symbols rather
#               than alignment positions
#               The left part will be chosen as big as possible, following gaps
#               in the consensus sequence will result in additional truncation
#               of the alignment segments.
#   -debug      [STD]
#
# - return val: array of references to the two project parts
#
# DESCRIPTION
# - sequence stretch offset values of the derived parts are not updated
#   and are the values of the mother project.
# - Despite a part project may not have a resulting length it's core data is
#   the same as the mother project and there're as much sequence entries
#   as in the mother project (with length zero in that case).
#
# DEVELOPER'S NOTES
# - this function shows similarities to &SeqLab::SeqFormat::SeqRange
#
sub AlnprojSplit {
  my ($pProj,$iSplit,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug : undef;
  if ($iSplit<0) { die sprintf "%s. argument ERROR, negative split position\n", &MySub; }
  my $iSplitAln=$iSplit;
  if ($opt{-consens}) {
    if ($$pProj{consens}{sequence} =~ m/^-*([a-z]-*){$iSplit}/i) {
      $iSplitAln = length($&);
    } else {
      printf STDERR "%s. FATAL ERROR: regexp match failure for 5' part\n", &MySub;
      printf STDERR "  alignment ref: %s\n", $pProj;
      printf STDERR "  # entries: %d\n", int(@{$$pProj{align}});
      printf STDERR "  consensus ID: %s\n", $$pProj{consens}{id}||"''";
      printf STDERR "  consensus length: %d\n", length($$pProj{consens}{sequence});
      printf STDERR "  \$iSplit: %d -> %d\n", $iSplit, $iSplitAln;
      exit 1;
    }
  }

  # clip right end from 5' part
  my $pProj5end = &AlnprojClone($pProj,-debug=>$dbg2);
  foreach (@{$$pProj5end{align}}) {
    if (length($_->{sequence})<$iSplitAln) { next }
    substr($_->{sequence},$iSplitAln) = '';
  }

  # clip left end from 3' part
  my $pProj3end = &AlnprojClone($pProj,-debug=>$dbg2);
  foreach (@{$$pProj3end{align}}) {
    substr($_->{sequence},0,$iSplitAln) = '';
  }

  # test
  if ($debug) {
    printf STDERR "%s. effective parameters of alignment parts:\n", &MySub;
    printf STDERR "  total length:    %d\n", length($$pProj{consens}{sequence});
    printf STDERR "  # entries full:  %d\n", int(@{$$pProj{align}});
    printf STDERR "  length 5' part:  %d\n", length($$pProj5end{consens}{sequence});
    printf STDERR "  # entries 5':    %d\n", int(@{$$pProj5end{align}});
    printf STDERR "  length 3' part:  %d\n", length($$pProj3end{consens}{sequence});
    printf STDERR "  # entries 3':    %d\n", int(@{$$pProj3end{align}});
    if ($debug>1) {
      print STDERR  &AlnprojClustalSprint($$pProj{align});
      print STDERR  &AlnprojClustalSprint($$pProj5end{align});
      print STDERR  &AlnprojClustalSprint($$pProj3end{align});
    }
  }

  return ($pProj5end,$pProj3end);
}


# split project into three parts: 5' end, core, 3' end
#
# INTERFACE
# - argument 1: reference to project data
# - argument 2: consensus length of the 5' part
# - argument 3: consensus length of the middle part
#
# - options:    see AlnprojSplit
#
# - return val: array of references to the three project parts
#
sub AlnprojSplit3 {
  my ($pProj,$iSplit5,$iSplitM,%opt) = @_;
  if ($opt{-debug}) {
    printf STDERR "%s. Alignment parameters:\n", &MySub;
    printf STDERR "  alignment ref: %s\n", $pProj;
    printf STDERR "  # entries: %d\n", int @{$$pProj{align}};
    printf STDERR "  \$iSplit5: %d\n", $iSplit5;
    printf STDERR "  \$iSplitM: %d\n", $iSplitM;
  }

  # split
  my ($pProj5end,$pProjCore) = &AlnprojSplit ($pProj, $iSplit5, %opt);
  my ($pProj3end);
  ($pProjCore,$pProj3end) = &AlnprojSplit ($pProjCore, $iSplitM, %opt);

  # return parts
  return ($pProj5end,$pProjCore,$pProj3end);
}


# join array of projects to one single project
#
# INTERFACE
# - arguments:  array of references to projects
#               data is left unchanged
#
# - global options:
#   -debug        [STD]
#
# - return val: reference to the new joint project,
#               undef if an error occurred.
#
# DESCRIPTION
# - margins are corrected for each part project.
# - contributing projects doesn't have to have the same number of alignment
#   entries, and the projects doesn't have to share the entries of each
#   other.
# - sorting order of alignment entries of the joint project will be the same
#   as in the first of the contributing projects.
#
sub AlnprojJoin {
  my ($pProjMaster, @ProjAdd) = @_;
  my ($debug, $dbg2);
  my ($LenMaster, $CtProj);
  my ($pProjAdd, $AlnEntryNext);

  # function parameters
  $debug = $main::ProgOpt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;
  unless (@ProjAdd) {
    printf STDERR "%s. ERROR: less than two source projects\n", &MySub;
    return undef;
  }

  # prepare master project
  $pProjMaster = &AlnprojClone ($pProjMaster);
  &AlnMargins ($$pProjMaster{align});
  if ($debug) {
    printf STDERR "%s. project 0: %d entries:\n", &MySub, int @{$$pProjMaster{align}};
    foreach (@{$$pProjMaster{align}}) {
      print  STDERR "  $_->{id}, $_->{orient}, $_->{offset}\n";
    }
  }

  # loop over all projects to be added
  while ($pProjAdd = shift(@ProjAdd)) {
    $CtProj ++;
    $LenMaster = length ($$pProjMaster{align}[0]{sequence});
    $debug and printf STDERR "%s. project %d: %d entries\n", &MySub, $CtProj, int @{$$pProjAdd{align}};

    # loop over all sequence entries of next project
    foreach $AlnEntryNext (@{$$pProjAdd{align}}) {
      my $AlnEntryMaster;

      # look for pair of entries sharing ID (and more?)
      my ($CtPair, $bEquiv, @Found);
      foreach (@{$$pProjMaster{align}}) {

        # pair for same ID, different orientaton
        if ($AlnEntryNext->{id}     eq $_->{id} and
            $AlnEntryNext->{orient} ne $_->{orient}) {
          $CtPair ++;
        }

        # pair for same ID, same orientaton
        elsif ($AlnEntryNext->{id}     eq $_->{id} and
               $AlnEntryNext->{orient} eq $_->{orient}) {
          $CtPair ++;

          # pair of entries fully equivalent?
          if ($AlnEntryNext->{offset} eq $_->{offset}) {
            $AlnEntryMaster = $_;
            $bEquiv = 1;
            $debug and printf STDERR "%s. found stretch pair matching ID & strand & offset, entry %s\n", &MySub, $AlnEntryNext->{id};
            last;
          } else {
            push @Found, $_;
            $debug and printf STDERR "%s. found stretch pair matching ID & strand, entry %s\n", &MySub, $AlnEntryNext->{id};
          }
        }
      }

      # is pair of entries unique?
      if (!$bEquiv and $CtPair==1 and $Found[0]) {
        $AlnEntryMaster = shift (@Found);
        $bEquiv = 1;  # not really needed anymore
      }

      # no unique stretch pair found ? => generate gap in master alignment
      elsif (! $bEquiv) {
        %$AlnEntryMaster = %$AlnEntryNext;
        $AlnEntryMaster->{sequence} = '-' x $LenMaster;
        push @{$$pProjMaster{align}}, $AlnEntryMaster;
        $debug and printf STDERR "%s. gap insert for non-matching entry %s\n", &MySub, $AlnEntryNext->{id};
      } elsif ($debug) {
        printf STDERR "%s. found continuing sequence stretch for entry %s\n", &MySub, $AlnEntryNext->{id};
      }

      # join sequence stretches
      $AlnEntryMaster->{sequence} .= $AlnEntryNext->{sequence};
    }

    # re-nice master project alignment
    &AlnMargins ($$pProjMaster{align});
  }

  # exit SUB
  return $pProjMaster;
}


# delete sequence entry from alignment
#
# INTERFACE
# - argument 1: reference to project data
# - argument 2: identifier of sequence to be deleted
#
# - options:
#   -debug      [STD]
#
# DESCRIPTION
# - The referenced project is explicitly changed.
# - all entries matching to identifier will be deleted (there may be
#   more than just one).
#
# DEBUG, CHANGES, ADDITIONS
# - A regexp may be used instead of an explicit identifier.
#
sub AlnprojEntryDel {
  my ($pProj, $SeqID, %opt) = @_;
  my ($debug, $dbg2);
  my ($bDel, $CtI);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # look for matching ID
  for ($CtI=0; $CtI<@{$$pProj{align}}; $CtI++) {
    if ($$pProj{align}[$CtI]{id} eq $SeqID) {

      # delete entry
      splice @{$$pProj{align}}, $CtI, 1;
      $bDel = 1;
    }
  }

  # entry/entries deleted: update consensus, nice alignment
  if ($bDel) {
    &AlnprojConsUpdate ($pProj);
    &AlnCompress ($$pProj{align});
    $$pProj{changes} = 1;
  }

  # nothing has changed
  else {
    $debug and printf STDERR "%s. no entry matching to identifier scheme\n", &MySub;
  }
}


################################################################################
# project sequence operations
################################################################################


# hide all stretch extensions
#
# INTERFACE
# - argument 1: reference to project data (data is explicitly changed).
#
# DESCRIPTION
# - This is desired for the concept of hiding / showing divergent stretch
#   ends in divergent SimClusters.
# - The referenced project data is explicitly changed. Changes aren't
#   registered in the flag $$pProj{changes}, as they are regarded as
#   'layout modifications'.
# - Offset values of the sequence entries should refer to the validated
#   part of the sequence stretch (big letters).
#
sub AlnprojExtensHide {
  my $pProj = shift;

  # delete all lower-case sequence stretches
  foreach (@{$$pProj{align}}) {
    $_->{sequence} =~ s/[a-z]/-/g;
  }

  # renice project
  &AlnCompress ($$pProj{align});

  # project has been changed
  $$pProj{changes} = 1;
}


# validate all stretch extensions
#
# INTERFACE
# - argument 1: reference to project data
#               data is explicitly changed.
#
# DESCRIPTION
# - This is desired for the concept of hiding / showing extensions of stretch
#   ends in divergent SimClusters.
# - Offset values of the sequence entries don't refer correctly to the
#   validated part of the sequence stretches.
#
sub AlnprojExtensValid {
  my ($pProj, %opt) = @_;

  # change lower-case sequence stretches to upper case
  foreach (@{$$pProj{align}}) {
    $_->{sequence} =~ tr/[a-z]/[A-Z]/;
  }

  # update consensus sequence
  &AlnprojConsUpdate ($pProj);

  # project has been changed
  $$pProj{changes} = 1;
}


# update consensus sequence of the project
#
# INTERFACE
# - argument 1: reference to project data
#               the referenced data is explicitly changed
#
# - options:
#   -debug      [STD]
#
# DESCRIPTION
# - note: changes in the consensus sequence are not reported via the
#   $$pProj{changes} flag.
#
sub AlnprojConsUpdate {
  my ($pProj, %opt) = @_;
  my ($debug, $dbg2);
  my ($pAlnTmp);

  # function parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : undef;

  # make sure:
  # - project alignment contains a consensus entry as first entry
  # - working alignment contains all entries but the consensus entry
  @$pAlnTmp = @{$$pProj{align}};
  if ($$pProj{align}[0]{id} =~ m/$reAlnConsens/io) {
    shift (@$pAlnTmp);
  } else {
    unshift @{$$pProj{align}}, {
      id     => 'consensus',
      offset => 1,
      orient => 'F',
      };
  }

  # $$pProj{consens} is made a reference to the consensus sequence entry
  $$pProj{consens} = $$pProj{align}[0];

  # get consensus sequence
  $$pProj{consens}{sequence} = &AlnConsens ($pAlnTmp,
    -MaskGapEnd => $$pProj{param}{Report}{SymbolAnalysis}{MaskGapEnd},
    -MaskGapIns => $$pProj{param}{Report}{SymbolAnalysis}{MaskGapInside},
    -debug      => $dbg2);
}


1;
# $Id: AlnProj.pm,v 1.14 2007/09/29 10:12:22 szafrans Exp $
