################################################################################
#
#  kPerl GSCJ ReadWatch Concept
#  Library for Sequencing Read Management
#
#  copyright (c)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 1999-2004,
#    szafrans@imb-jena.de
#  Karol Szafranski on behalf of FLI Jena, Genome Analysis Group, 2007,
#    szafrans@fli-leibniz.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#
# - reading identifiers
#   %ReadidField
#   &ReadidToFields
#   &ReadTreeIndex
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

package ReadWatch::Read;

# includes
#use strict; use warnings;  # OK 20040810

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  %ReadidField &ReadidToFields &ReadTreeIndex
  );


################################################################################
# reading identifiers
################################################################################


# parameters
our (%ReadidField);
$ReadidField{SectionToSuffix} = {
  low  => '_lo',
  high => '_hi',
  };
$ReadidField{SectionToRegexp} = {
  low  => '([a-h]0[1-8]|_lo)',
  high => '([a-h](09|1[0-2])|_hi)',
  };
$ReadidField{prdAnti} = {
  '1'  => '-1',
  '-1' => '1',
  '0'  => '0',
  };
$ReadidField{prd2prm}{GSCJ} = {
  '1'  => 's',
  '-1' => 'r',
  '0'  => 'w',
  };


# split read name to information fields
#
# INTERFACE
# - argument 1: sequence identifier
#
# - options:
#   -clone      use clone phrase only in analysis
#   -debug      print debug protocol to STDERR
#
# - return val: - reference to hash of fields
#               - undef if none of the naming scheme matches
#
# DESCRIPTION
# - explanation of returned fields:
#   chm  sequencing chemistry. The meaning of this shortcut is naming scheme-
#        specific.
#   cln  clone identifier in original naming scheme, cmp. field "coo".
#   coo  plate coordinates, e.g. "a08"
#        letter is forced to lower case, number is forced to 2 digits
#   lib  full library (target + library letter)
#   prd  sequencing direction on template.
#         1       forward
#        -1       reverse
#        0/undef  undetermined
#   prm  primer shortcut. The meaning of this shortcut is naming scheme-
#        specific. Therefore, it's of minor value in downstream code. Better
#        use field "prd" for naming scheme-independent information.
#   rd   concatenation of fields that specify the sequencing reaction:
#        primer + number + chemistry
#   stx  syntax scheme, corresponding to the production source
#   tgt  superior sequencing target. Don't confuse with name of sequencing
#        template - this is usually the clone name.
#   typ  read type: clone name + primer shortcut
#
# DEBUG, CHANGES, ADDITIONS
# - If possible, eliminate use of field "prm" from the downstream code.
#   The primer shortcut semantics is highly specific for the naming scheme.
# - It would be nice to have this function turned to an object library.
#   This would be especially useful for dynamically derived fields like
#   $field{typ} or $field{rd}.
# - Differentiation between plate sections 'high' and 'low' is obsolete in
#   modern-times sequencing technique that is strictly based on 96-well plates.
#
sub ReadidToFields {

  # function constants
  my %re = (
    Baylor  => { clone => '^(II[A-I])([A-Z])([A-Z])(\d)([A-Z])(\d{2,3})(\d{2})A?$',
                 read  => '^(II[A-I])([A-Z])([A-Z])(\d)([A-Z])(\d{2,3})(\d{2})A?$' },
    GSCJ    => { clone => '^(\w+)([a-z])(\d{2,3})([a-p]\d{2}|_lo|_hi)$',
                 read  => '^(\w+)([a-z])(\d{2,3})([a-p]\d{2}|_lo|_hi)\.([fprstw])(\d)([bept])?$' },
    Sanger  => { clone => '^(s\w+)([A-Z])(\d{1,3})([a-p]\d{1,2})$',
                 read  => '^(s\w+)([A-Z])(\d{1,3})([a-p]\d{1,2})\.([pq])(\d)([a-z])([z])?$' },
    default => { clone => '^(.+)$',
                 read  => '^(.+)\.(\w+)$' },
    );
  $re{Baylor}{read} = $re{Baylor}{clone};

  # function parameters
  my ($ReadName,%opt) = @_;
  my $debug = $opt{-debug};
  my $NameCateg = $opt{-clone} ? 'clone' : 'read';

  my %field;
  if (0) { }
  # test splitting read name

  ##############################################################################
  # GSCJ syntax
  # 
  # JC1a10b08.r1b
  # ====
  # library
  #     ==
  #     plate number
  #       ===
  #       coordinates
  #           =
  #           primer
  #            =
  #            number to reach uniqueness
  #             =
  #             chemistry (optional, default "BigOrg")
  elsif ($ReadName =~ m/$re{GSCJ}{$NameCateg}/) {

    # basic information fields
    %field = (
      full => $ReadName,
      stx  => 'GSCJ',  # naming syntax scheme
      tgt  => $1,
      lib0 => $2,      # library letter
      plt  => $3,      # plate
      coo  => $4,
      prm  => $5||'',
      chm  => $7||'',
      num  => $6||'',  # read number
      );

    # rebuild: lib cln typ rd
    $field{lib} = $1 . $2;
    $field{cln} = $field{lib} . $3 . $4;
    $field{typ} = $field{cln} .'.'. $field{prm};
    $field{rd}  = $field{prm} . $field{num} . $field{chm};

    # derived field: chm
    # - default is "BigOrg"
    # - primer "t" correlates with terminator chemistry = "t"
    $field{chm} ||= ($field{prm} eq 't') ? 't' : 'b';

    # derived fields: prd sec
    # sec   plate section ('high'/'low')
    $field{prd}   = ($field{prm} =~ m/[fst]/) ? 1 : 0;
    $field{prd} ||= ($field{prm} =~ m/[r]/)  ? -1 : 0;
    if ($field{coo} =~ m/^$ReadidField{SectionToRegexp}{low}$/) {
      $field{sec} = 'low';
    }
    if ($field{coo} =~ m/^$ReadidField{SectionToRegexp}{high}$/) {
      $field{sec} = 'high';
    }
  }

  ##############################################################################
  # Sanger syntax
  # quite similar to GSCJ
  # 
  # sdicA10b8.q1b
  # ====
  # library
  #     ===
  #     plate number
  #        ==
  #        coordinates
  #           =
  #           primer
  #            =
  #            number to reach uniqueness
  #             =
  #             chemistry

  # split read name
  elsif ($ReadName =~ m/$re{Sanger}{$NameCateg}/) {

    # basic information fields
    %field = (
      full => $ReadName,
      stx  => 'Sanger',  # naming syntax scheme
      tgt  => $1,
      lib0 => $2,        # library letter
      plt  => $3,        # plate
      coo  => lc $4,
      prm  => $5,
      chm  => $7,
      num  => $6,        # read number
      );

    # rebuild: lib cln typ rd
    $field{lib} = $1 . $2;
    $field{cln} = $field{lib} . $3 . $4;
    $field{typ} = $field{cln} .'.'. $field{prm};
    $field{rd}  = $field{prm} . $field{num} . $field{chm};

    # generalise coordinate syntax
    # - lower-case letter
    # - two-digit number
    # - this must FOLLOW assignment of $field{cln}
    if (length($field{coo}) == 2) {
      substr ($field{coo}, 1, 0) = '0';
    }

    # derived fields: prd
    $field{prd}   = ($field{prm} =~ m/[p]/) ?  1 : 0;
    $field{prd} ||= ($field{prm} =~ m/[q]/) ? -1 : 0;
  }

  ##############################################################################
  # Baylor syntax
  # 
  # IICBP1D25501
  # ====
  # library
  #        ===
  #        plate number
  #           ==
  #           coordinates [strictly numerical key]
  #       =         
  #       primer    
  #      =                               
  #      number to reach uniqueness      
  #     =                
  #     chemistry        
  # 
  # IIIAP1D2126A (variant)
  #            =		   
  #            unknown

  # split read name
  elsif ($ReadName =~ m/$re{Baylor}{$NameCateg}/) {

    # basic information fields
    %field = (
      full => $ReadName,
      stx  => 'Baylor',  # naming syntax scheme
      tgt  => $1,
      lib0 => $2,        # library letter
      plt  => $6,        # plate
      coo  => $7,
      prm  => $5,
      num  => $4,        # read number
      chm  => $3,
      );

    # generalise coordinate syntax
    # Baylor has a strictly numeric coordinate system
    # *** implement me ***

    # rebuild: lib cln typ rd
    $field{lib} = $field{tgt} . $field{lib0};
    $field{cln} = $field{lib} . $field{plt} . $field{coo};
    $field{typ} = $field{cln} .'.'. $field{prm};
    $field{rd}  = $field{prm} . $field{num} . $field{chm};

    # derived fields: prd
    $field{prd}   = ($field{prm} =~ m/[D]/) ?  1 : 0;
    $field{prd} ||= ($field{prm} =~ m/[E]/) ? -1 : 0;
  }

  ##############################################################################
  # default syntax
  # try do extract least amount of information while assuming GSCJ naming scheme
  #   (for primer suffix)

  # split read name
  elsif ($ReadName =~ m/$re{default}{$NameCateg}/) {

    # at least have read and clone name
    $field{full} = $ReadName;
    $field{stx} = 'GSCJ-minimal';
    $field{cln} = $1;
    $field{rd}  = $2;

    # differentiate clone information: tgt lib0 lib
    if ($field{cln} =~ m/^([jJ]\w+)([a-z])\d/) {
      $field{tgt} = $1;     # target
      $field{lib0} = $2;    # library letter
      $field{lib} = $field{tgt} . $field{lib0};
    }

    # analyze sequencing information: num chm prd
    if ($field{rd} and $field{rd} =~ m/^([fprstw])(\d)([bept])?$/) {
      $field{prm} = $1;    # target
      $field{chm} = $2;    # library letter
      $field{num} = $3;
      $field{chm} ||= ($field{prm} == 't') ? 't':'b';
      $field{prd}   = ($field{prm} =~ m/[fst]/) ? 1 : 0;
      $field{prd} ||= ($field{prm} =~ m/[r]/) ? -1 : 0;
    } else {
      $field{prm} = $field{rd};
      $field{prd} = 0;
    }

    # syntheses: typ
    if (exists($field{cln}) and $field{cln} and exists($field{prm}) and $field{prm}) {
      $field{typ} = $field{cln} .'.'. $field{prm};
    }
  }

  # no match to read name convention
  else { return undef }

  # clone information only
  if ($opt{-clone}) {
    map { delete $field{$_} } qw(prm prd typ chm num);
  }

  # return result
  return \%field;
}


# create tree index on list of reads
#
# INTERFACE
# - argument 1: reference to read data
#               array of read names or hash using read names as keys
#
# - options:
#   -idref      each hash of the tree structure contains the field 'id'
#               which gives the anonymous hash references some kind
#               of identity
#   -debug      print debug protocol to STDERR
#   -format     specify what type of index is wanted:
#               clone       default index type, used by &ContigStruct:
#                           clone name -> clone end -> primer+number+chem of
#                             read
#                           At the clone end level, field "id" contains the
#                           "read type" string.
#               ClonePrm    clone name -> primer -> number of read
#                           At the primer level, field "id" contains the
#                           "read type" string.
#               CloneStout  clone name -> primer + number of read
#               LibClone    library name -> clone sub-name -> primer ->
#                           number of read
#               ReadType    read type (clone + primer) -> number of read
#
# - return val: - reference to index data structure
#               - undef if an error occurred
#
# DESCRIPTION
# - the creation of the index is based on the analysis of identifier
#   information fields using &ReadidToFields. A read identifier that's
#   not yielding information fields will be omitted from the index.
# - cmp. module ReadWatch::ReadIndex.pm - hat mit dieser Funktion nicht
#   wirklich etwas zu tun.
#
sub ReadTreeIndex {
  my ($DataArg, %opt) = @_;
  my ($debug, $format, $bHash);
  my ($pReadArr, %ReadIdx, $read, $value, $pField);

  # function parameters
  if (ref($DataArg) eq 'HASH') {
    $pReadArr = [ keys %$DataArg ];
    $bHash = 1;
  } elsif (ref($DataArg) eq 'ARRAY') {
    $pReadArr = $DataArg;
    $bHash = 0;
  } else {
    return undef;
  }
  $format = $opt{-format} || 'clone';

  if (0) { }

  # index type clone
  elsif ($format eq 'clone') {
    foreach $read (@$pReadArr) {
      $pField = &ReadidToFields($read) or next;
      unless ($$pField{cln} and defined($$pField{prd}) and $$pField{rd}) { next }
      $value = $bHash ? $$DataArg{$read} : $read;
      $ReadIdx{$$pField{cln}}{$$pField{prd}}{$$pField{rd}} = $value;
      if ($opt{-idref}) {
        $ReadIdx{$$pField{cln}}{id} ||= $$pField{cln};
      }
    }
    return \%ReadIdx;
  }

  # index type ClonePrm
  elsif ($format eq 'ClonePrm') {
    foreach $read (@$pReadArr) {
      $pField = &ReadidToFields($read) or next;
      unless ($$pField{cln} and $$pField{prm} and $$pField{num}) { next }
      $value = $bHash ? $$DataArg{$read} : $read;
      $ReadIdx{$$pField{cln}}{$$pField{prm}}{$$pField{num}} = $value;
      if ($opt{-idref}) {
        $ReadIdx{$$pField{cln}}{id} ||= $$pField{cln};
        $ReadIdx{$$pField{cln}}{$$pField{prm}}{id} ||= $$pField{typ};
      }
    }
    return \%ReadIdx;
  }

  # index type CloneStout
  elsif ($format eq 'CloneStout') {
    foreach $read (@$pReadArr) {
      $pField = &ReadidToFields ($read) or next;
      $value = $bHash ? $$DataArg{$read} : $read;
      $ReadIdx{$$pField{cln}}{$$pField{prm}.$$pField{num}} = $value;
      if ($opt{-idref}) {
        $ReadIdx{$$pField{cln}}{id} ||= $$pField{cln};
      }
    }
    return \%ReadIdx;
  }

  # index type LibClone
  elsif ($format eq 'LibClone') {
    foreach $read (@$pReadArr) {
      $pField = &ReadidToFields ($read) or next;
      $value = $bHash ? $$DataArg{$read} : $read;
      $ReadIdx{$$pField{lib}}{$$pField{plt}.$$pField{coo}}{$$pField{prm}}{$$pField{num}} = $value;
      if ($opt{-idref}) {
        $ReadIdx{$$pField{lib}}{id} ||= $$pField{lib};
        $ReadIdx{$$pField{lib}}{$$pField{plt}.$$pField{coo}}{id} ||= $$pField{cln};
        $ReadIdx{$$pField{lib}}{$$pField{plt}.$$pField{coo}}{$$pField{prm}}{id} ||= $$pField{typ};
      }
    }
    return \%ReadIdx;
  }

  # index type ReadType
  elsif ($format eq 'ReadType') {
    foreach $read (@$pReadArr) {
      $pField = &ReadidToFields ($read) or next;
      $value = $bHash ? $$DataArg{$read} : $read;
      $ReadIdx{$$pField{typ}}{$$pField{num}} = $value;
      if ($opt{-idref}) {
        $ReadIdx{$$pField{typ}}{id} ||= $$pField{typ};
      }
    }
    return \%ReadIdx;
  }

  # unknown index type
  else {
    return undef;
  }
}


1;
# $Id: Read.pm,v 1.9 2007/09/05 17:04:07 szafrans Exp $
