################################################################################
#
#  kPerl Sequence Alignment Laboratory
#  Interface Library for exalin Binary and Its Report
#
#  copyright (c)
#    Fritz Lipmann Institute Jena, Genome Analysis Group, 2010
#  authors
#    Andreas Petzold, andpet@fli-leibniz.de
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#
# - all that is
#   &Exalin
#   &get_Exalin
#
################################################################################

package SeqAlign::Exalin;

# includes
use strict; #use warnings;  # OK 1970xxxx
use FileHandle;
use MainLib::Path qw(%CorePath);
use MainLib::Data qw(&DataPrint);
use SeqLab::SeqFormat qw(&SeqentryPopFasta);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &Exalin &get_Exalin
  );


################################################################################
# all that is
################################################################################


# interface to exalin
#
# INTERFACE
# - argument 1: name of single sequence file cont. genomic sequence
# - argument 2: name of single sequence file cont. mRNA sequence
#
# - options:
#   -debug      ...
#   -ParseRslt  parsing ready exalin output (argument 1)
#
# - return val: - result structure, array of hashes each containing:
#                 pos      feature position (ungapped sequence)
#                 length   feature length
#                 feature  feature specifier
#               - undef if an error occurred
#
sub Exalin {
  my ($SeqFileGenome,$SeqFileRna,%opt) = @_;
  my $debug = $opt{-debug};

  # interface to file/call
  my $h;
  if ($opt{-ParseRslt}) {
    $h = FileHandle->new($SeqFileGenome,'r');
  } else {
    $h = FileHandle->new("$CorePath{call}{exalin} -genome $SeqFileGenome -est $SeqFileRna -space 500000 -align |",'r');
  }
  unless ($h) {
    die sprintf "%s. ERROR when invoking $CorePath{call}{exalin} output\n", (caller(0))[3];
  }
  my $buffer;
  while (<$h>) { $buffer.=$_; if(m/^EXALIN/){ last } }

  # loop for parsing alignment block
  my (%parse,@result);
  while ($buffer =~ m/\n( *\S+ +\d+ +)([a-zA-Z\-\.]+) +\d+\n([ 0-9\.\|\<\>\?-]+)\n *(\S+) +(\d+) +([A-Z\-\.]+) +\d+\n/gs) {
    $parse{RnaID} ||= $4;
    $parse{RnaOff} ||= $5;

    push @{$parse{genome}}, $2;
    push @{$parse{map}}, substr ($3, length $1);
    push @{$parse{rna}}, $6;
  }
  my %str = (
    genome => join('',@{$parse{genome}}),
    map    => join('',@{$parse{map}}),
    rna    => join('',@{$parse{rna}}),
    );
  if (not $str{genome}){ return undef }

  # do we have to reverse-complement the RNA?
  my $bReverse=0;
  my $RnaLen;
  if ($buffer =~ m/reversed est/) {
    $bReverse=1;
    my $pRna = &SeqentryPopFasta ($SeqFileRna, -pure=>1);
    $RnaLen = length $$pRna{sequence};
  }
  if ($debug) {
    printf STDERR "%s. parsing result:\n", (caller(0))[3];
    print  STDERR "  RNA ID: $parse{RnaID}\n";
    print  STDERR "  RNA offset: $parse{RnaOff}\n";
    print  STDERR "  genome  $str{genome}\n";
    print  STDERR "  map     $str{map}\n";
    print  STDERR "  RNA     $str{rna}\n";
  }
  unless ($parse{RnaOff}) {
    printf STDERR "%s. ERROR: unable to find offset for RNA seq %s, exalin says:\n%s", (caller(0))[3],
      $parse{RnaID}||"''", $buffer;
    return undef;
  }
  undef $buffer;

  # translate alignment data into feature data
  while ($str{rna} =~ m/(-+)(\.{3,})/) {
    $str{rna} = $` . ('N' x length($1)) . $2 . $';
  }
  while ($str{rna} =~ m/(\.{3,})(-+)/) {
    $str{rna} = $` . $1 . ('N' x length($2)) . $';
  }
  while ($str{rna} =~ m/(N+)(\.{3,})(N+)/ and length($1) + length($3) >= 10) {
    $str{rna} = $` . $2 . $1 . $3 . $';
  }
  while ($str{rna} =~ m/(N{10,}\.{3,}|\.{3,}N{10,})/g) {
    $buffer = $`;
    $str{label} = substr ($str{map}, length($`), length($&));
    $str{label} =~ s/[^0-9]//g;
    $buffer =~ s/[.-]//g;
    my $pos = length($buffer);
    push @result, {
      'pos'     => $pos + $parse{RnaOff} - 1,
      'length'  => $str{label},
      'feature' => 'ReadGap',
      };
  }
  while ($str{rna} =~ m/([A-MO-Z-]N{0,9})\.{3,}N{0,9}[A-MO-Z-]/g) {
    $buffer = $` . $1;
    $str{label} = substr ($str{map}, length($`) + length($1), length($&));
    $str{label} =~ s/[^0-9]//g;
    $buffer =~ s/[.-]//g;
    my $pos = length($buffer);
    push @result, {
      'pos'     => $pos + $parse{RnaOff} - 1,
      'length'  => $str{label},
      'feature' => 'intron',
      };
  }

  # sort features according to their positions
  # eventually reverse-complement the RNA
  if ($bReverse) {
    map {
      $_->{pos} = $RnaLen - $_->{pos};
    } @result;
  }
  @result = sort { $$a{pos} <=> $$b{pos} or -1; } @result;
  $debug and &DataPrint (\@result, -handle=>\*STDERR);

  return \@result;
}


sub get_Exalin {
	my ($fh) = @_;
	my %result = ();
	my $got_record = 0;
	my @vals = ();
	
	while(<$fh>){
		# start a record
		if(m/^EXALIN/){
			$got_record = 1;
			$result{RESULT_STR} = "";
			$result{GENOME_STR} = "";
			$result{EST_STR} = "";
			$result{EXON} = [];
		}
		# est
		elsif(m/^T-SEQUENCE: (\S+)/){
			$result{EST} = $1;
		}
		# est_len
		elsif(m/^T-LENGTH: (\S+)/){
			$result{EST_LEN} = $1;
		}
		# genome
		elsif(m/^G-SEQUENCE: (\S+)/){
			$result{GENOME} = $1;
			
		}
		# genome_len
		elsif(m/^G-LENGTH: (\S+)/){
			$result{GENOME_LEN} = $1;
		}
		# score
		elsif(m/^SCORE: (\S+)/){
			$result{SCORE} = $1;
		}
		# splicing direction
		elsif(m/^SPLICE-DIR: (\S+)/){
			$result{SPLICE_DIR} = $1;
		}
		# strand
		elsif(m/^T-STRAND: (\S+)/){
			$result{STRAND} = $1;
		}
		# exon
		elsif(m/^EXON/){
			@vals = split(/[-\(\)\s]+/,$_);
			$vals[8]=~s/%//;
			if($vals[2]<$vals[1]){
				($vals[1],$vals[2]) = ($vals[2],$vals[1]);
			}
			$result{EXON} ||= [];
			push(@{$result{EXON}},{
				'EST_START'=>$vals[1]-1,'EST_END'=>$vals[2]-1,
				'GENOME_START'=>$vals[3]-1,'GENOME_END'=>$vals[4]-1,
				'SCORE'=>$vals[5],'SCORE_DONOR'=>$vals[6],'SCORE_ACC'=>$vals[7],
				'IDENTITY'=>$vals[8]
				});
		}
		# est align string
		elsif(m/^T:\s+(\S+)/){
			$result{EST_STR} .= $1;	
		}
		# genome align string
		elsif(m/^G:\s+(\S+)/){
			$result{GENOME_STR} .= $1;
		}
		# end a record
		elsif(m/^EXIT/ && $got_record){
			$result{RESULT_STR} .= $_;
			$result{RESULT_STR} .= "\n";
			last;
		}
		
		# store alignment string
		if($got_record){
			$result{RESULT_STR} .= $_;
		}
	}
	
	if (!$got_record or !$result{GENOME} or !$result{EST}) { return undef }	
	
	# for psl formats
	foreach my $tag (qw(MATCH MISMATCH REP NCOUNT EST_INSERTS EST_INSERTBP GENOME_INSERTS GENOME_INSERTBP)){
		$result{$tag} = 0;
	}
	
	if (int(@{$result{EXON}})>0) {
		my @align_gen = split(//,$result{GENOME_STR});
		my @align_est = split(//,$result{EST_STR});
		my $e = 0;
		my $in_exon = 1;
		my $exon_start = 0;
		my $i = 0;
		
		
		for($i = 0;$i<=$#align_gen;$i++){
			# first exon - intron
			if($in_exon && $align_gen[$i] eq "."){
				$in_exon = 0;
				$result{EXON}->[$e]->{ALIGN_LEN} = $i-$exon_start;
				$result{EXON}->[$e]->{GENOME_STR} =  substr($result{'GENOME_STR'},$exon_start,$result{EXON}->[$e]->{ALIGN_LEN});
				$result{EXON}->[$e]->{EST_STR} =  substr($result{'EST_STR'},$exon_start,$result{EXON}->[$e]->{ALIGN_LEN});
				
			}elsif(!$in_exon && $align_gen[$i] ne "."){
				$in_exon = 1;
				$exon_start = $i;
				$e++;
			}
			
			if($align_gen[$i] ne "."){
				if($align_gen[$i] eq "-"){
					$result{EST_INSERTBP}++;
					unless($i>0 && $align_gen[$i-1] eq "-"){
						$result{EST_INSERTS}++;
					}
				}elsif($align_est[$i] eq "-"){
					$result{GENOME_INSERTBP}++;
					unless($i>0 && $align_est[$i-1] eq "-"){
						$result{GENOME_INSERTS}++;
					}
					
				}else{
					if($align_gen[$i] eq "N" or $align_est[$i] eq "N"){$result{NCOUNTS}++;}
					elsif($align_gen[$i] ne $align_est[$i]){$result{MISMATCH}++;}
					else{
						$result{MATCH}++;
					}
				}
				
			}
		}
		
		# last exon
		$result{EXON}->[-1]->{ALIGN_LEN} = $#align_gen-$exon_start+1;
		$result{EXON}->[-1]->{GENOME_STR} =  substr($result{'GENOME_STR'},$exon_start,$result{EXON}->[-1]->{ALIGN_LEN});
		$result{EXON}->[-1]->{EST_STR} =  substr($result{'EST_STR'},$exon_start,$result{EXON}->[-1]->{ALIGN_LEN});
		
		if($result{STRAND} eq "-"){
			$result{EST_START} = $result{EXON}->[-1]->{EST_START};
			$result{EST_END} = $result{EXON}->[0]->{EST_END};
			
		}else{
			$result{EST_START} = $result{EXON}->[0]->{EST_START};
			$result{EST_END} = $result{EXON}->[-1]->{EST_END};
		}
		$result{GENOME_START} = $result{EXON}->[0]->{GENOME_START};
		$result{GENOME_END} = $result{EXON}->[-1]->{GENOME_END};
	}
	
	return \%result;
}


1;
# $Id: Exalin.pm,v 1.4 2018/06/05 18:02:56 szafrans Exp $
