################################################################################
#
#  kPerl Database Concepts
#  Library for Plain Text Table Procedures
#
#  copyright (c)
#    Institute of Molecular Biotechnology Jena, Dept. Genome Analysis, 1998-2002
#    Karol Szafranski and Inst. Physiol. Chem., Univ. Dsseldorf, 1998
#  author
#    Karol Szafranski, karol.szafranski@leibniz-fli.de
#
################################################################################
#
#  DESCRIPTION
#
# - individual description of functions at the beginning of the code blocks
#
# - table file delimiter concepts
#
#   RetHead   Every field value followed by a line feed. Starting paragraph
#             of field labels followed by an additional line feed. As a
#             consequence, each column must have a label.
#   RetX      Every field value followed by a line feed. Datasets have a
#             fixed number of field values specified by the parameter 'X'
#             in 'RetX'.
#   SpaceRet  at least one ' ' as delimiter between the fields of each
#             line.
#   TabRet    standard TAB-delimited files, default concept
#             first line contains field labels
#
# - internal data representation
#   [see updated list in Table.pm]
#
#   following abbreviations of the data representation concepts are used to
#   name the functions provided in this library.
#
#   A1     line-array of single-field lines (simply an array)
#          if there're additional fields according to the specified delimiting
#          concept they're ignored.
#   AA     line-array of field arrays
#   AC     colunm-array of entry arrays
#   AH     line-array of field hashes
#          The first array element is a pointer to an array of the column
#          labels in original sorting order.
#   HCA    column-hash of field arrays
#   HI2    hash of single-field lines (simply a hash)
#          As a consequence data volume is restricted to or will be reduced
#          to two columns only
#   HIA    hash index on array:
#          each first column values (or column specified via -ColIdx)
#          references an array containing the remaining fields (or all
#          with option -ColKeep)
#   HIH    hash index on hashes:
#          each first column values (or column specified via -ColIdx)
#          references a hash containing the remaining fields (or all
#          with option -ColKeep)
#
################################################################################
#
#  FUNCTIONS
#
#   @EXPORT
#   %LibParam (not exported)
#
# - basics
#   $LibParam{default}
#
# - database input
#   &PlainToTable
#   &LoadFoid
#
# - data formatting
#   &AntiFieldQuote  (not exported)
#   &AntiTableQuote  (not exported)
#
# - database output
#   &OutputTableAH
#
#
#  STD OPTIONS
#
#   -debug      print debug protocol to STDERR
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - see &PlainToTable, switch -comments
#
# - look also for notes in the header of each function block
#
################################################################################

package database::DbPlain;

# includes
#use strict;  # OK 2003xxxx use warnings;
use MainLib::StrRegexp qw($reEndl);
use MainLib::Data qw(&DataPrint);
use MainLib::File qw(&ReadFile);
use MainLib::Misc qw(&MySub);
use database::Table qw(&TableConvert);

# symbol export
use Exporter qw(import);
our @EXPORT = qw (
  &PlainToTable &LoadFoid
  &OutputTableAH
  );

# package-wide constants and variables
my %LibParam;


################################################################################
# basics
################################################################################

# class constants
$LibParam{default} = {
  delimit => 'TabRet',
  TabType => 'AA',
  };


################################################################################
# database input
################################################################################


# read table data into any table data structure type
#
# INTERFACE
# - argument 1: - file path / input call for table document
#               - reference to input file handle
#               - reference to plain table document
#
# - options:
#   ...           all options that may be handed over to &TableConvert
#   -ColKeep      keep column used for indexation also in array of
#                 values in data line, default: 0
#   -ColLabel     reference to array of column labels that are to be used in
#                 creating hash type target types of table data.
#   -ColShift     shift first line of table data into column label definitions
#   -comments     ignore empty lines or lines beginning with a '#' character
#                 - equal to combination of switches -FltComment and -FltEmpty.
#                 This option must be set if the column labels are meant to
#                 be stripped from the header.
#                 BUG: If the table fields are quoted, the comment character
#                 won't be diagnosed
#   -debug        [STD]
#   -delimit      delimiter concept, cf. 'main description', supported:
#                 - SCALAR 'RetHead'
#                   don't use options -ColLabel, -ColShift, and
#                   -comments here.
#                 - SCALAR 'Ret\d+'
#                   don't use options -ColLabel and -ColShift here.
#                 - SCALAR 'SpaceRet'
#                 - SCALAR 'TabRet' (default delimiter concept)
#                 - HASH { line=>RegexpLine, col=>RegexpCol }
#                   This is the combination of options -delimCol and -delimLine.
#                   Regexps should not contain bracketed expressions, otherwise
#                   the behaviour of field splitting will be unexpected
#   -delimCol     data field delimiter (regexp)
#   -delimLine    data line delimiter (regexp)
#   -FltComment   filter lines from input if beginning with a '#'
#                 This option will be automatically set with option -comments
#   -FltEmpty     filter empty lines from input
#                 This option will be automatically set with option -comments
#   -TabType      table data structure type to be returned, default:
#                 ($LibParam{default}{TabType})
#   -wantscalar   indicate that the return value shall be a scalar though
#                 the function call may be in array context.
#
# - return val: - reference to table data structure
#               - wantarray
#                 - reference to table data structure
#                 - reference to array of column labels (those that've been
#                   read from the input file, specified via option or returned
#                   from conversion function)
#                 - string of concatenated comment lines
#               - undef if an error occurred
#
# DESCRIPTION
# - an array of field labels may be extracted from the table header if
#   following situation is met:
#   - option -comments is set (labels reside there!)
#   - no option -ColLabel (this overrides the feature)
#   - no option -ColShift (this overrides the feature)
#
# DEBUG, CHANGES, ADDITIONS
# - see option -comments (effect of quoted fields)
#
sub PlainToTable {
  my ($PathTable, %opt) = @_;
  my ($debug, $dbg2, $delimit);
  my ($DocTab, $DocComment, $CtI, $CtComment, $CtEmpty, @line);
  my (@TablePrim, $ColumnNum, $pColumn, $pLine, $pTableData);

  ########################################################################
  # pre-work

  # work out parameters
  $debug = $opt{-debug};
  $dbg2  = $debug ? $debug-1 : 0;
  $delimit = $opt{-delimit} || $LibParam{default}{delimit};
  if ($delimit eq 'TabRet')   { $delimit = { col=>'\t', line=>$reEndl }; }
  if ($delimit eq 'SpaceRet') { $delimit = { col=>' +', line=>$reEndl }; }
  if ($opt{-delimCol}) { $$delimit{col}=$opt{-delimCol}; }
  if ($opt{-delimLine}) { $$delimit{line}=$opt{-delimLine}; }
  if (exists($opt{-comment})) { $opt{-comments} ||= $opt{-comment}; }
  if ($opt{-comments}) {
    $opt{-FltComment} = 1;
    $opt{-FltEmpty} = 1;
  }

  # get plain table source
  if (ref($PathTable) eq 'SCALAR') {
    $DocTab = $$PathTable;
  } elsif (! ref($PathTable) or ref($PathTable) =~ m/\b(FileHandle|GLOB)/) {
    unless (defined ($DocTab=&ReadFile($PathTable))) {
      $debug and printf STDERR "%s. ERROR: unable to read file %s\n", &MySub, $PathTable||"''";
      return undef;
    }
  } else {
    $debug and printf STDERR "%s. ERROR: inappropriate type of table input argument %s\n", &MySub, ref($PathTable);
    return undef;
  }

  ########################################################################
  # parse document

  # optionally filter comments / empty lines
  if ($opt{-FltComment}) {
    while ($DocTab =~ m/^#.*?($reEndl)/gmo) {
      $CtComment ++; 
      $DocTab = $` . $';
      pos ($DocTab) = length ($`);
      $DocComment .= $&;
    }
    $debug and printf STDERR "%s. filtered %d comment lines\n", &MySub, $CtComment;
  }
  if ($opt{-FltEmpty}) {
    while ($CtI = ($DocTab =~ s/^($reEndl)//gmo)) { $CtEmpty += $CtI; }
    $debug and printf STDERR "%s. filtered %d empty lines\n", &MySub, $CtEmpty;
  }

  # RetHead delimiter concept
  if ($delimit eq 'RetHead') {

    # split to lines
    @line = split (/$reEndl/, $DocTab);
    undef $DocTab;

    # get column labels
    while ($_ = shift @line) {
      push @$pColumn, $_;
    }
    $ColumnNum = int @$pColumn;
    delete $opt{-ColLabel};
    delete $opt{-ColShift};

    # split to fields
    while (int @line) {
      push @TablePrim, [ splice @line, 0, $ColumnNum ];
    }
  }

  # RetX delimiter concept
  if ($delimit =~ m/^Ret(\d+)$/) {
    $ColumnNum = $1;

    # split to lines
    @line = split (/$reEndl/, $DocTab);
    undef $DocTab;

    # get column labels
    @$pColumn = splice @line, 0, $ColumnNum;
    delete $opt{-ColLabel};
    delete $opt{-ColShift};

    # split to fields
    while (int @line) {
      push @TablePrim, [ splice @line, 0, $ColumnNum ];
    }
  }

  # delimiter concepts specified by a HASH
  elsif (ref($delimit) eq 'HASH') {
    $debug and printf STDERR "%s. applying regexp delimiters: line '%s', column '%s'\n", &MySub,
      $$delimit{line}, $$delimit{col};

    # anti quote, split to lines
    @line = split (/$$delimit{line}/, $DocTab);
#    my ($CtBrack, $CtSep);
#    if ($CtBrack = $$delimit{line} =~ s/(^|[^\\])\(/$&/g) {
#      @line = map { !(($CtSep++)%$CtBrack) ? $_ : () } @line;
#    }
    undef $DocTab;

    # split to fields
    while (@line) {
      push @TablePrim, [ split (/$$delimit{col}/, shift @line) ];
    }
  }

  # unknown delimiter concept
  else {
    $debug and printf STDERR "%s. ERROR: unknown delimiter argument %s, default value %s\n", &MySub,
      $delimit||"''", $LibParam{default}{delimit};
    return undef;
  }

  ########################################################################
  # reformat table data

  # remove field quoting
  foreach $pLine (@TablePrim) {
    for ($CtI=0; $CtI<@$pLine; $CtI++) {
      $$pLine[$CtI] = &AntiFieldQuote ($$pLine[$CtI]);
    }
  }

  # get column labels
  if (wantarray or $opt{-TabType}) {
    if (ref($opt{-ColLabel}) eq 'ARRAY') {
      $pColumn = $opt{-ColLabel};
    } elsif ($opt{-ColShift}) {
      $pColumn = shift @TablePrim;
    } elsif (($opt{-comments} or $opt{-FltComment}) and ref($delimit) and
        $DocComment =~ m/\#\s*(column )?labels:?\s*($reEndl)\#\s*(.*)($reEndl)/io) {
      $pColumn = [ split (/$$delimit{col}/, $3) ];
      $debug and printf STDERR "%s. found column labels in comments: %s\n", &MySub, join (' ', @$pColumn);
    }
  }

  # reformat table to type of choose
  if ($opt{-TabType} and $opt{-TabType} ne $LibParam{default}{TabType}) {
    $opt{-ColLabel} = $pColumn;
    $opt{-debug} = $dbg2;
    $pTableData = &TableConvert (
      $LibParam{default}{TabType}, $opt{-TabType}, 
      \@TablePrim, %opt);
  } else {
    $pTableData = \@TablePrim;
  }
  
  # exit SUB successfully
  return (wantarray and ! $opt{-wantscalar}) ? 
    ($pTableData, $pColumn, $DocComment) : $pTableData;
}


# load file of IDs
#
# INTERFACE
# - argument 1: source argument (path of file, open handle, or SCALAR reference)
#
# - options:    most of &PlainToTable
#
# - return val: array reference
#
sub LoadFoid {
  my ($ArgSrc, %opt) = @_;

  # load via DbPlain::PlainToTable
  return &PlainToTable ($ArgSrc, -TabType=>'A1',
    -delimit    => {line=>$reEndl,col=>'(\t| +)'},
    -FltComment => 1,
    %opt);
}


################################################################################
# data formatting
################################################################################


# remove quoting concept for field values (cf. MS Excel)
#
# INTERFACE
# - argument 1: field value
# - return val: converted field value
#
sub AntiFieldQuote {
  my $field = $_[0];

  $field =~ s/^"(.*)"$/$1/;
  return $field;
}


# remove quoting concept from entire table document (cf. MS Excel)
#
# INTERFACE
# - argument 1: plain table data
# - return val: converted table document
#
# DESCRIPTION
# - this works only for TAB column delimiter
#
sub AntiTableQuote {
  my $DocTab = shift;
  my ($pre, $field, $post);

  while ($DocTab =~ m#(^|\t)"([^\t\n"]*?)"(\t|$)#gm) {
    $pre   = $` . $1;
    $field = $2;
    $post  = $3 . $';
    $field =~ s/""/"/g;
    $DocTab = $pre .$field. $post;
  }

  # exit SUB
  return $DocTab;
}


################################################################################
# database output
################################################################################


# write AH representation to table file
#
# INTERFACE
# - argument 1: reference to table data
# - argument 2: reference to array of field labels
# - argument 3: output file path
#
# - options:
#   -delimit  delimiter concept, cf. 'main description'
#             supported: TabRet (default)
#
# - return val 1: 0 success
#                 X error
#
# DESCRIPTION
# - accept data as AH representation
# - only fields specified in the array of field label (argument 2 -> @Field)
#   are written to the file
#
sub OutputTableAH {
  my @Table = @{ shift @_ };
  my @Field = @{ shift @_ };
  my $PathOutput = shift;
  my %opt = @_;

  # check arguments, open output file
  if (! @Field) {
    printf STDERR "%s. ERROR: missing array of field labels (\@Field)\n", &MySub;
    return 1;
  }
  if (! open (OUTTABLE, ">$PathOutput")) {
    printf "%s. Error: Unable to open file '$PathOutput' for output\n", &MySub;
    return 1;
  }

  # print header
  print OUTTABLE join ("\t", @Field), "\n";

  # print entries
  foreach $Entry (@Table) {
    print OUTTABLE join ("\t", @$Entry{@Field}), "\n";
  }

  # exit sub
  close OUTTABLE;
  return 0;
}


1;
# $Id: DbPlain.pm,v 1.10 2018/06/05 18:02:56 szafrans Exp $
