#! /usr/local/bin/perl
################################################################################
#
#  Net Utilities
#
#  copyright (c)
#  Karol Szafranski at Inst. Physiol. Chem. Dsseldorf, 1997-1998 (HTTP Robot)
#  Karol Szafranski on behalf of IMB Jena, Dept. Genome Analysis, 2001-2004,
#    szafrans@imb-jena.de
#
################################################################################
#
#  DESCRIPTION
#
# - See function &usage for description of command line syntax
#
# - each function comes along with a description at the beginning of the code
#   block
#
################################################################################
#
#  FUNCTIONS, DATA
#
# - MAIN
#   $ProgFile,$ProgFstump
#   %ProgParam
#   $ProgMode,%ProgOpt,@ProgArg
#
# - usage help, command line arguments
#   &usage
#   &AddSwitch
#
# - basic netting
#   &ProgGet
#   &ProgDecode
#
# - NCBI
#   &NcbiSeq
#   &ProgCog
#
################################################################################
#
#  DEBUG, CHANGES, ADDITIONS
#
# - look also for notes in the header of each function block
#
################################################################################

# include path(s), includes
BEGIN {
  unshift @INC, grep{$_} split(/:+/,$ENV{KPERLPATH}||$ENV{PERLPATH}||'');
}
use strict; #use warnings;  # OK 20071122
use LWP::UserAgent;
use MainLib::Data;
use MainLib::Path;
use MainLib::Cmdline qw(&GetoptsNArgs &InargFileExists);
use MainLib::File;
use MainLib::Misc;
use MainLib::Internet;
use Math::Calc;
use database::DbPlain;


# script ID
# - program name as specified on the command line
our $ProgFile = ( split('/',__FILE__) )[-1];
our $ProgFstump=$ProgFile; $ProgFstump=~s/\.\w{1,4}$//;

# global constants (esp. default values)
our %ProgParam;
$ProgParam{CgiArg}{COG} = {
  hit  => 3,
  seq  => '',
  };
$ProgParam{NcbiRetr} = {
  asn   => 'RetrvAsn',
  est   => 'RetrvEst',
  fasta => 'RetrvFa',
  genbank => 'RetrvGb',
  gi    => 'RetrvGi',
  };
$ProgParam{default}{ProgMode} = 'get';


# manage I/O #####################################

# organise I/O handles
&Unbuffer();


# command line interface #########################
# NOTE:
# - &GetoptsNArgs (via &AddSwitch) modifies global variables:
#   $ProgMode @ProgArg %ProgOpt
# - @ProgArg may be pre-filled in &GetoptsNArgs via &AddSwitch (option -fofn)

# arguments, switches, default subprogram
our %ProgOpt = ();
our $ProgMode = undef;
our @ProgArg = ();
unshift @ProgArg, &GetoptsNArgs();
$ProgMode ||= $ProgParam{default}{ProgMode};

# eventually open LOG file
if ($ProgOpt{-log}) {
  $ProgOpt{LogFile} = ($ProgOpt{-log}!=1) ? $ProgOpt{-log} : undef;
  &LogOpen (-file=>$ProgOpt{LogFile}, -stamp=>$ProgFstump, -prog=>"$ProgFile -$ProgMode");
}
END {
  $ProgOpt{-log} and &LogClose();
}


# work flow manifold #############################

# chain to program mode without input argument(s)
if (0) { }
elsif (!@ARGV or $ProgMode=~m/^h(elp)?$/i) { &usage() }

# ensure input argument(s)
unless (@ProgArg) {
  die "ERROR: input arguments missing\n";
}

# chain to program mode (with input argument(s))
if (0) { }
elsif ($ProgMode =~ m/^cog$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgCog ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^decode$/i) {
  &InargFileExists($ProgArg[0],-stdin=>1,-exit=>\&usage);
  &ProgDecode ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^get$/i) {
  &ProgGet ($ProgArg[0]);
}
elsif ($ProgMode =~ m/^(NcbiSeq|SeqBatch)$/i) {
  &NcbiSeq (@ProgArg);
}
else {
  die "ERROR: unknown program mode or switch '$ProgMode'\n";
}

# exit script successfully
# cf. END blocks!
exit 0;


################################################################################
# usage help, command line arguments, basic I/O
################################################################################


sub usage {
  print "\n";
  print <<END_USAGE;
DESCRIPTION
 $ProgFile performs internet-related task, it especially offers specific
 download facilities.

COMMAND LINE SYNTAX
 $ProgFile -<ModeSwitch> [-<OptionalSwitch> ...] <Arg1> [<Arg2> ...]

Arguments
---------
 See function descriptions.

Path Arguments
--------------
 Relative paths will be resolved according to the pwd. Prefixes "~" and "~uid"
 are resolved to the home directories. Path "-" resolves to STDIN or STDOUT,
 depending on the context.

ModeSwitch (case-insensitive)
-----------------------------
<none>            default ModeSwitch -$ProgParam{default}{ProgMode} if program arguments are given.
                  Otherwise like ModeSwitch -help.
-COG              query NCBI COG server
                  Arg1+       path of query sequence (single-sequence file)
                              plain or fastA sequence format is allowed
-decode           decode document parts which are coded in MIME, UU etc.
                  The program will interactively ask for output filenames.
                  Arg1        document file
-get              simply get a page referred by an URL. An alternative to
                  GNU wget.
                  Arg1        URL
-h(elp)           output command line usage description and exit
-NcbiSeq          NCBI batch sequence download.
                  Arg1+       AccNos.

OptionalSwitch (case-insensitive)
---------------------------------
switch argument types: B:=boolean, F:=floating point/scientific, N:=integer,
S:=string, X:=varying type.

-debug(=N)        print debug protocol to STDERR (sometimes STDOUT). Keep
                  temporary files.
                  N           debug depth value
-fofn=S           supply list of command arguments in a file. The entries of the
                  file will be appended to the argument list. Multiple -fofn
                  switch statements are allowed.
-log(=S)          redirect STDOUT and STDERR to LOG file
                  S           log file path, default path worked out
                              automatically from built-in directives.
-OutFmt=S         output file format

Environment Variables
---------------------
 \$PERLPATH        primary search path for Perl package look-up
 \$TEMPPATH        directory for storage of temporary files, default /tmp

Temporary Files
---------------
 The program may generate temporary files. These will be placed either in a
 directory specified by \$ENV{TEMPPATH} or in /tmp.

Further Reading
---------------
 A detailed, though incomplete, manual is available at
 $CorePath{call}{MeInstDoc}/$CorePath{man}{SeqMotif} .
END_USAGE
  print "\n";
  exit 0;
}


# add program switches to global table (hash)
#
# INTERFACE
# - argument 1:  switch argument without leading '-'
#
# - global options:
#   -debug       [STD]
#
# - global data:
#   $ProgMode
#   @ProgArg     may be pre-filled here beside it's typically used to store
#                return value from &GetoptsNArgs via &AddSwitch (option
#                -fofn)
#   %ProgOpt  switch data which gets processed here
#
# DESCRIPTION
# - this function gets called by &MainLib::Misc::GetoptsNArgs
# - switch arguments are tested for validity. Arguments are parsed with highest
#   possible tolerance. This way, syntax errors can reported in accordance to
#   the actual switch, rather than reporting ANY syntax error.
#
sub AddSwitch {
  my $switch = shift;
  my ($debug);
  my ($SwitchArg, $pTable);

  # optional switches
  if ($switch =~ m/^debug(=(\d+))?$/i) {
    $ProgOpt{-debug} = defined($2) ? int($2) : 1;
    return;
  }
  if ($switch =~ m/^fofn=(.+)$/i) {
    $SwitchArg = ($1 eq '-') ? $1 : &PathExpand($1);
    if ($pTable = &LoadFoid ($SwitchArg)) {
      push @ProgArg, @$pTable;
      $debug and printf STDERR "%s. %d entries loaded from fofn %s\n", &MySub,
        int(@$pTable), $SwitchArg;
    } else {
      die sprintf "ERROR: unable to read entries from file of filenames %s (-> %s)\n",
        $1, $SwitchArg;
    }
    return;
  }
  if ($switch =~ m/^log(=(.*))?$/i) {
    $ProgOpt{-log} = $2 ? &PathExpand($2) : 1;
    return;
  }
  if ($switch =~ m/^OutFmt=(\w+)$/i) {
    $ProgOpt{-OutFmt} = lc ($1);
    return;
  }

  # program mode switches
  if (defined $ProgMode) {
    die sprintf "ERROR: multiple specification of program mode or unknown switch, %s and %s\n",
      '-'.($ProgMode||"''"), '-'.($switch||"''");
  }
  else {
    $ProgMode = $switch;
    return;
  }
}


################################################################################
# basic netting
################################################################################


# download document by URL
#
# INTERFACE
# - argument 1: URL
# - return val: success status (boolean)
#
# DESCRIPTION
# - output is done to STDOUT
#
sub ProgGet {
  my ($url) = @_;

  # do request
  my $pReq = HTTP::Request->new('GET',$url);
  my $pUsrag = LWP::UserAgent->new();
  my $pResp = $pUsrag->request($pReq);

  # analyse response, output result
  if ($pResp->is_success()) {
    print  $pResp->content();
    return 1;
  } else {
    printf STDERR "%s. request failed, error %d, report:\n%s", &MySub,
      $pResp->code(), $pResp->content();
    return 0;
  }
}


# decode document parts which are coded in MIME, UU etc.
#
# INTERFACE
# - argument 1: document file
#
# DESCRIPTION
# - output is done to new directory "msg-*"
#
sub ProgDecode {
  my ($PathDoc) = @_;
  my ($PathDir, $pParser, $pEntity);
  my ($CtPart, $PathSave);

  # parse document
  require MIME::Parser;
  $pParser = MIME::Parser->new();
  $pParser->output_under(&PathCwd());
  $PathDir = $pParser->filer->output_dir();
  unless ($pEntity = $pParser->parse_open ($PathDoc)) {
    printf STDERR "ERROR: parse failed: %s\n", $pParser->last_error();
    $pEntity->dump_skeleton;
  }

  # report paths of created files
  for ($CtPart=0; $CtPart<$pEntity->parts(); $CtPart++) {
    if ($PathSave = $pEntity->parts($CtPart)->{ME_Bodyhandle}{MB_Path}) {
      printf "%s\n", $PathSave;
    }
    foreach (@{$pEntity->parts($CtPart)->{ME_Parts}}) {
      $pEntity->add_part($_);
    }
  }
}


################################################################################
# NCBI
################################################################################


# NCBI batch sequence download
#
# INTERFACE
# - argument 1+: list of AccNos.
#
# - global options:
#   -debug       [STD] in &ProgGet
#   -OutFmt      some of Entrez' output format options
#
# DEBUG, CHANGES, ADDITIONS
# - With EST Acc.Nos. and output format "GenBank" this function produces EST
#   entry output. In single-entry view of Entrez it's possible to select
#   "GenBank" format, but i don't know how to request it for batch downloads.
#
sub NcbiSeq {
  my (@AccNo) = @_;
  my $debug = $ProgOpt{-debug};
  my $format = lc($ProgOpt{-OutFmt}||'') || 'fasta';
  my $RetrFmt = $ProgParam{NcbiRetr}{$format};
  unless (exists $CorePath{www}{NCBI}{$RetrFmt}) {
    die sprintf "ERROR: unknown output format %s -> %s -> %s\n",
      $ProgOpt{-OutFmt}||"''", $format||"''", $RetrFmt||"''";
  }

  # loop over small packages of IDs
  my $SeqPerQuery = 50;
  while (@AccNo) {
    my @SeqCurr = splice (@AccNo, 0, $SeqPerQuery);

    # construct URL string
    my $url = 'http://' . $CorePath{www}{NCBI}{host} . $CorePath{www}{NCBI}{$RetrFmt};
    $url .= join (',', @SeqCurr);

    # continue elsewhere
    $debug and printf STDERR "%s. requesting URL %s\n", &MySub, $url;
    my $ret = &ProgGet ($url);
    unless ($ret) {
      printf STDERR "%s. failed to retrieve sequences:\n", &MySub;
      print  STDERR map { "  $_\n" } @SeqCurr;
    }
  }
}


# query NCBI COG server
#
# INTERFACE
# - argument 1: path of query sequence
#               plain or fastA format is allowed
#
# - global options:
#   -debug      [STD]
#
sub ProgCog {
  my ($ArgSeq) = @_;
  my $debug = $ProgOpt{-debug};
  my $url = 'http://' . $CorePath{www}{NCBI}{host} . $CorePath{www}{NCBI}{COG};

  # do request
  my $pReqDat = &DataClone ($ProgParam{CgiArg}{COG});
  $$pReqDat{seq} = &ReadFile ($ArgSeq);
  my $pReq = HTTP::Request->new('POST',$url);
  $pReq->content($_=&FormdatEncodeURL($pReqDat));
  $pReq->content_length(length());
  $pReq->content_type('application/x-www-form-urlencoded');
  my $pUsrag = LWP::UserAgent->new();
  $pUsrag->prepare_request($pReq);
  if ($debug||1) {
    printf STDERR "%s. resulting HTTP request:\n", &MySub;
    print  STDERR $pReq->as_string();
  }
  my $pResp = $pUsrag->request($pReq);

  # analyse response
  if ($pResp->is_success()) {
    print  $pResp->content();
    return 1;
  } else {
    printf STDERR "%s. request failed, error %d \"%s\" (info %d), header:\n%s", &MySub,
      $pResp->code(), $pResp->message()||'', $pResp->is_info()||0,
      $pResp->header()||'';
    return 0;
  }
}
# $Id: Net.pl,v 1.20 2007/11/22 17:14:23 szafrans Exp $
