################################################################################
#
#  kPerl Sequence Alignment Laboratory
#  Interface Library for est2genome Binary, Part of the EMBOSS Package
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 2000,2002,
#    szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#
# - all that is
#   &EstGenomeV20  (not exported)
#   &EstGenome
#
################################################################################

package SeqAlign::EstGenome;

# includes
#use strict; use warnings;  # OK 2003xxxx
use MainLib::Path qw(%CorePath);
use MainLib::File qw(&ReadFile);
use MainLib::Data qw(&DataPrint);
use SeqLab::SeqFormat qw(&SeqentryPopFasta);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &EstGenome
  );


################################################################################
# all that is
################################################################################


# interface to est2genome
#
# INTERFACE
# - argument 1: name of single sequence file cont. genomic sequence
# - argument 2: name of single sequence file cont. mRNA sequence
#
# - options:
#   -debug      ...
#   -ParseRslt  parsing ready est2genome output (argument 1)
#
# - return val: - result structure, array of hashes each containing:
#                 pos      feature position (ungapped sequence)
#                 length   feature length
#                 feature  feature specifier
#               - undef if an error occurred
#
# DESCRIPTION
# - call "est_genome -genome xz -est yz -space 500000 -align" produces
#   something like this:
#   ... [to follow]
#
sub EstGenomeV20 {
  my ($SeqFileGenome,$SeqFileRna,%opt) = @_;
  my $debug = $opt{-debug};
  my $dbg2  = $debug ? $debug-1 : undef;

  # BLAST program stated
  my $buffer;
  if ($opt{-ParseRslt}) {
    $buffer = &ReadFile ($SeqFileGenome);
  } else {
    $buffer = &ReadFile ("$CorePath{call}{est2genome} -genome $SeqFileGenome -est $SeqFileRna -space 500000 -align |");
  }
  unless ($buffer) {
    die sprintf "%s. ERROR when invoking $CorePath{call}{est2genome} output\n", (caller(0))[3];
  }

  # parsing alignment block loop
  my (%parse,%str);
  my ($bReverse,$RnaLen,@result);
  while ($buffer =~ m/\n( *\S+ +\d+ +)([a-zA-Z\-\.]+) +\d+\n([ 0-9\.\|\<\>\?-]+)\n *(\S+) +(\d+) +([A-Z\-\.]+) +\d+\n/gs) {
#    $debug and print STDERR '', map { "$_\n" } $1, $2, $3, $4, $5, $6, $7;
    $parse{RnaID} ||= $4;
    $parse{RnaOff} ||= $5;

    push @{$parse{genome}}, $2;
    push @{$parse{map}}, substr ($3, length $1);
    push @{$parse{rna}}, $6;
  }
  $str{genome} = join ('', @{$parse{genome}});
  $str{map}    = join ('', @{$parse{map}});
  $str{rna}    = join ('', @{$parse{rna}});
  $str{genome} or return undef;

  # do we have to reverse-complement the RNA?
  if ($buffer =~ m/reversed est/) {
    $bReverse= 1;
    my $pRna = &SeqentryPopFasta ($SeqFileRna, -pure=>1);
    $RnaLen = length $$pRna{sequence};
  }
  if ($debug) {
    printf STDERR "%s. parsing result:\n", (caller(0))[3];
    print  STDERR "  RNA ID: $parse{RnaID}\n";
    print  STDERR "  RNA offset: $parse{RnaOff}\n";
    print  STDERR "  genome  $str{genome}\n";
    print  STDERR "  map     $str{map}\n";
    print  STDERR "  RNA     $str{rna}\n";
  }
  unless ($parse{RnaOff}) {
    printf STDERR "%s. ERROR: unable to find offset for RNA seq %s, est2genome says:\n%s", (caller(0))[3],
      $parse{RnaID}||"''", $buffer;
    return undef;
  }
  undef $buffer;

  # translate alignment data into feature data
  while ($str{rna} =~ m/(-+)(\.{3,})/) {
    $str{rna} = $` . ('N' x length($1)) . $2 . $';
  }
  while ($str{rna} =~ m/(\.{3,})(-+)/) {
    $str{rna} = $` . $1 . ('N' x length($2)) . $';
  }
  while ($str{rna} =~ m/(N+)(\.{3,})(N+)/ and length($1) + length($3) >= 10) {
    $str{rna} = $` . $2 . $1 . $3 . $';
  }
  while ($str{rna} =~ m/(N{10,}\.{3,}|\.{3,}N{10,})/g) {
    $buffer = $`;
    $str{label} = substr ($str{map}, length($`), length($&));
    $str{label} =~ s/[^0-9]//g;
    $buffer =~ s/[.-]//g;
    my $pos = length($buffer);
    push @result, {
      'pos'     => $pos + $parse{RnaOff} - 1,
      'length'  => $str{label},
      'feature' => 'ReadGap',
      };
  }
  while ($str{rna} =~ m/([A-MO-Z-]N{0,9})\.{3,}N{0,9}[A-MO-Z-]/g) {
    $buffer = $` . $1;
    $str{label} = substr ($str{map}, length($`) + length($1), length($&));
    $str{label} =~ s/[^0-9]//g;
    $buffer =~ s/[.-]//g;
    my $pos = length($buffer);
    push @result, {
      'pos'     => $pos + $parse{RnaOff} - 1,
      'length'  => $str{label},
      'feature' => 'intron',
      };
  }

  # sort features according to their positions
  # eventually reverse-complement the RNA
  if ($bReverse) {
    map {
      $_->{pos} = $RnaLen - $_->{pos};
    } @result;
  }
  @result = sort { $$a{pos} <=> $$b{pos} or -1; } @result;
  $debug and &DataPrint (\@result, -handle=>\*STDERR);

  # exit SUB
  return \@result;
}


# appropriate function for current est2genome version
#
sub EstGenome {
  return &EstGenomeV20(@_);
}


1;
# $Id: EstGenome.pm,v 1.7 2007/09/05 15:35:00 szafrans Exp $
