################################################################################
#
#  kPerl Core Library Group
#  Library for Strings and Regexps
#
#  copyright (c)
#  Karol Szafranski at Inst. Physiol. Chem. Dsseldorf, 1997-1998
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 1998-2002,2004,
#    szafrans@imb-jena.de
#  Karol Szafranski at UPenn Philadelphia, Center for Bioinformatics, 2004,
#    karol@pcbi.upenn.edu
#  Karol Szafranski on behalf of FLI Jena, Genome Analysis Group, 2006,
#    szafrans@fli-leibniz.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
################################################################################
#
#  FUNCTIONS, DATA
#
#   @EXPORT
#
# - pack, unpack
#   @W64
#   %idxW64
#   &packW64
#   &unpackW64
#   &packWX
#
# - text line formatting
#   $sEndl
#   $reEndl  ('|', but no brackets!)
#   &LineFeedUnix
#   &LineBreak
#   &NumFormat
#
# - time formatting
#   $TimeStrDft
#   $TimeStrComp
#   &TimeStr
#
# - regexp housekeeping
#   &RegexpEncode
#
# - regexp matching
#   &MatchFirstPos
#   &MatchCt
#   &MatchArray
#   &MatchIdx
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################

package MainLib::StrRegexp;

# includes
use strict; #use warnings;  # OK 20060209
use POSIX;
# we cannot use MainLib::Data here - it would could cause cross-include!
# we cannot use MainLib::Misc here - it would could cause cross-include!

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  @W64 %idxW64 &packW64 &unpackW64 &packWX
  $sEndl $reEndl &LineFeedUnix &LineBreak &NumFormat
  &TimeStr
  &RegexpEncode
  &MatchFirstPos &MatchCt &MatchArray &MatchIdx
  );


################################################################################
# pack, unpack
################################################################################


# a 64-letter alphabet, ordered different from the Base64 alphabet
# cf. http://www.freesoft.org/CIE/RFC/1521/7.htm
our @W64 = (0..9, 'a'..'z', 'A'..'Z', '+', '/');
my $CtW64=0;
our %idxW64 = map { ($_=>$CtW64++) } @W64;
# backward compatibility with former coding schemes
$idxW64{'_'} = 62;
$idxW64{','} = 63;


# pack integer to string of characters, each representing a 6 bit slice
#
# INTERFACE
# - argument 1: integer number
#
# - options:
#   -size       minimum length of string, possibly with leading zeros
#
# - return val: number packed to string
#
# - global data:
#   @W64        6 bit to character translation
#
# DESCRIPTION
# - WARNING: unless option -size=>1 is specified, the string equivalent for
#   number 0 will be an empty string.
#
sub packW64 {
  my ($num,%opt) = @_;
  my $strW64 = '';
  
  while ($num) {
    my $slice64 = $num % 64;
    substr($strW64,0,0) = $W64[$slice64];
    $num = int ($num/64);
  }
  if ($opt{-size}) {
    substr($strW64,0,0) = $W64[0] x ($opt{-size}-length($strW64));
  }
  return $strW64;
}


# unpack 6-bit-packed string representation of a number to integer
#
# INTERFACE
# - argument 1: string
# - return val: integer
#
# - global data:
#   %idxW64     character to 6 bit translation
#
sub unpackW64 {
  my ($strW64, %opt) = @_;
  my ($num);
  foreach (split //, $strW64) {
    $num *= 64;
    $num += $idxW64{$_};
  }
  return $num;
}


# pack integer to string of characters, custom character set
#
# INTERFACE
# - argument 1: integer number
# - argument 2: reference to array of characters (= character set)
#
# - options:
#   -size       minimum length of string
#
# - return val: number packed to string
#
# DESCRIPTION
# - WARNING: unless option -size=>1 is specified, the string equivalent for
#   number 0 will be an empty string.
#
sub packWX {
  my ($num, $pChar, %opt) = @_;
  my $str = '';
  my $mod = int (@$pChar) or return undef;
  
  while ($num) {
    my $slice = $num % $mod;
    substr($str,0,0) = $$pChar[$slice];
    $num = int ($num/$mod);
  }
  if ($opt{-size}) {
    substr($str,0,0) = $$pChar[0] x ($opt{-size}-length($str));
  }
  return $str;
}


################################################################################
# text line formatting
################################################################################


# machine-specific line feed
# regexp for line feed
#
our ($sEndl, $reEndl);
$sEndl = "\n";
$reEndl = '\r\n|\n\r?';


# convert line feed representations from all platforms to UNIX style
#
sub LineFeedUnix {
  my $doc = shift;
  $doc =~ s/\r\n|\n\r|\r/\n/g;
  return $doc;
}


# introduce line breaks in floating text
#
# INTERFACE
# - argument 1: text string
# - options:
#   -length     maximum length of line, default: 80 characters
#   -rm         remove all existing line breaks
# - return val: formatted text string
#
# DESCRIPTION
# - A trailing line break is added to the document string. But, it is ensured
#   that there will be only one single line break.
#
sub LineBreak {
  my ($doc,%opt) = @_;
  my $llen = $opt{-length} || 80;
  if (! defined($doc)) { $doc='' }

  # remove all existing line breaks
  if ($opt{-rm}) {
    $doc = &LineFeedUnix ($doc);
    $doc =~ s/\s*\n+\s*/ /g;
    pos($doc) = 0;
  }
  $doc =~ s/\s+$//g;

  # reformat text
  while ($doc =~ m/(.{1,$llen})($|\s+)/g) {
    my $pos = length($`) + length($1) + 1;
    $doc = "$`$1\n$'";
    pos($doc) = $pos;
  }

  # return formatted text
  return $doc;
}


# format number
#
# INTERFACE
# - argument 1: number to be formatted (english format)
#
# - options:
#   -CommaPre   introduce comma every 3 digits before the decimal point.
#               Value is the comma character to be introduced.
#   -CommaPost  introduce comma every 3 digits after the decimal point.
#               Value is the comma character to be introduced.
#
# - return val: formatted number
#
sub NumFormat {
  my ($Number, %opt) = @_;
  my ($NumberPre, $NumberPost);

  # split at the decimal point
  ($NumberPre, $NumberPost) = split /\./, $Number;

  # comma before the decimal point
  if ($opt{-CommaPre}) {
    while ($NumberPre =~ m/(\d)(\d{3})(${opt{-CommaPre}}|$)/) {
      $NumberPre = $` . $1 . $opt{-CommaPre} . $2 . $3 . $';
    }
  }

  # rebuild number
  $Number = $NumberPre . ($NumberPost ? '.' : '') . $NumberPost;
  return $Number;
}


################################################################################
# time formatting
################################################################################

# "constant" definition for strftime() format
# true constant definition would not allow to display the package settings in
#   front-end modules
our $TimeStrDft = '%Y-%m-%d %H:%M:%S %z(%Z)';
our $TimeStrComp = '%Y%m%d%H%M';

# return time in personal standard format
#
# INTERFACE
# - options:
#   -format  formatting of the time string. The syntax
#            is exactly as for the UNIX's strftime function.
#            special definitions:
#              CompactStd   "%Y-%m-%d %H:%M:%S %z(%Z)" (default)
#                           defined in global $TimeStrDft
#              CompactComp  "%Y%m%d%H%M"
#                           defined in global $TimeStrComp
#   -time    supply time value as seconds since epoch (Jan. 1, 1970, UTC),
#            default: current time
#   -zone    display time in specified time zone
#            local   local time zone (default)
#            GMT     Greenwich Mean Time
#            *** not implemented yet ***:
#                    it would not work with &POSIX::strftime
#            MET     Middle European Time
#            [+-]\d  any time zone relative to GMT
#
# DESCRIPTION
# - system's strftime().
#
sub TimeStr {
  my (%opt) = @_;
  my $debug = $opt{-debug};

  # function parameters
  $debug and printf STDERR "%s. using format '%s'\n", (caller(1))[3], $opt{-format};
  my $StrZone = $opt{-zone} || 'GMT';
    # *** not fully implemented ***

  # which time to determine?
  my $TimeSec = $opt{-time} || time;

  # process time format specials
  if (! $opt{-format} or $opt{-format} eq 'CompactStd') {
    $opt{-format} = $TimeStrDft;
  } elsif ($opt{-format} eq 'CompactComp') {
    $opt{-format} = $TimeStrComp;
  }

  # return from SUB
  return &POSIX::strftime ($opt{-format},
    (exists $opt{-zone} and $opt{-zone} eq 'GMT') ?
      gmtime ($TimeSec) : localtime ($TimeSec));
}


################################################################################
# regexp housekeeping
################################################################################


# work out regular expression from search string
#
# INTERFACE
# - argument 1: string to be converted
# - options:
#   -char       supply extra characters to get backslashed
#               These may be the framing characters in regexp syntax, e.g.
#               '/', that are not backslashed by default.
# - return val: converted string, regular expression
#
# DESCRIPTION
# - special characters are converted to the non-meaningful form to join a
#   regular expression.
#
sub RegexpEncode {
  my ($str, %opt) = @_;
  $str =~ s/[\.\+\*\^\$\(\)\[\]\{\}\|$opt{-char}]/\\$&/g;
  return $str;
}


################################################################################
# regexp matching
################################################################################


# get position of first RegExp match in string
#
# INTERFACE
# - argument 1: string to search in
# - argument 2: regexp to search for
# - return val: - string position of first occurrence
#               - -1 for no occurrence at all
#
sub MatchFirstPos {
  my ($StrTpl, $regexp) = @_;

  # match and determine position
  if ($StrTpl =~ m/$regexp/mg) {
    return length ($`);
  } else {
    return -1;
  }
}


# count frequency of a regexp match in a string
#
# INTERFACE
# - argument 1: reference to string, template of search
# - argument 2: regexp
# - return val: number of occurrences
#
# DESCRIPTION
# - this code is useful if you need a quick way to count regexp matches
#   and you don't care about time/space performance. Note that there's
#   a disadvantange coming up with large-scale data, esp. with high-multiple
#   search using the same regexp: time performance drops because the
#   regexp expression which is containing a string variable is recompiled
#   for every search run. However, when benchmarking this, i've seen minimal
#   performance drop versus an in-caller approach like this:
#   $CtMatch = do { $_=0; while($StrTpl =~ m/$StrRegexp/mog) {$_++;}; $_;};
#   It might be even slower - don't know why.
#
# DEBUG, CHANGES, ADDITIONS
# - see &MatchArray
#
sub MatchCt {
  my ($pStrTpl, $regexp) = @_;
  my ($CtInstance);

  # loop over matches
  while ($$pStrTpl =~ m/$regexp/mg) {
    $CtInstance ++;
  }
#  pos ($$StrTpl) = -1;

  # exit SUB
  return $CtInstance;
}


# get array of match instances
#
# INTERFACE
# - argument 1: reference to string, template of search
# - argument 2: regexp
# - return val: array of match instances
#
# DEBUG, CHANGES, ADDITIONS
# - we might need option -overlap in order to find overlapping
#   matches. Normally, the next possible match follows after the match.
#   But, there might be cases where a next instance can start
#   inside the last match.
#
sub MatchArray {
  my $pos = -1;
  my ($pStrTpl, $regexp) = @_;
  my (@instance);

  # loop over matches
  while ($$pStrTpl =~ m/$regexp/mg) {
    push @instance, $&;
  }
#  pos ($$pStrTpl) = -1;

  # exit SUB
  return @instance;
}


# get array of match positions/line positions/instances
#
# INTERFACE
# - argument 1: reference to string, template of search
# - argument 2: regexp
# - return val: - reference to array of matches, each represented by a hash:
#                 pos       string position of match offset
#                 lpos      line number of match offset
#                 instance  matching substring
#
# DEBUG, CHANGES, ADDITIONS
# - see &MatchArray
#
sub MatchIdx {
  my ($pStrTpl, $regexp) = @_;
  my (@idx, $PosLast, $LposLast, $sMatch);

  # function parameters
  $PosLast = 0;
  $LposLast = 1;

  # loop over matches
  while ($$pStrTpl =~ m/$regexp/g) {

    # enter match - pure positional data
    push @idx, {
      'pos'      => pos ($$pStrTpl),
      'lpos'     => &MatchCt (\(substr($$pStrTpl, $PosLast, pos ($$pStrTpl) - $PosLast)), '\n') + $LposLast,
      'instance' => $sMatch = $&,
      };

    # enter match
    $PosLast = pos ($$pStrTpl) = $idx[-1]{pos};
    $LposLast = $idx[-1]{lpos};

    # refine positional data
    $idx[-1]{pos} += - length ($sMatch) + 1;
    $idx[-1]{lpos} -= &MatchCt (\$sMatch, '\n');
  }
#  pos ($$StrTpl) = -1;

  # exit SUB
  return \@idx;
}


1;
# $Id: StrRegexp.pm,v 1.14 2008/06/11 08:44:58 szafrans Exp $
