#! /usr/local/bin/perl

# A simple Perl script for downloading the files
# from any given Protein Data Bank update date.
#
# Type perldoc getPdbUpdate.pl for usage information,
# or view the getPdbUpdate.html file.

$VERSION = '1.1';


#######################
# Documentation (pod) #
#######################

=head1 NAME

getPdbUpdate.pl

=head1 SYNOPSIS

A simple Perl utility for downloading the files
from any given Protein Data Bank update date.

  getPdbUpdate.pl dates
  getPdbUpdate.pl latest
  getPdbUpdate.pl 20020708

=head1 DESCRIPTION

=head2 Overview

getPdbUpdate.pl is a simple Perl program for downloading all files
from the Protein Data Bank (PDB) associated with a particular 
update date, including coordinate files, structure factors, 
and nmr restraints. 

=head2 Requirements

getPdbUpdate.pl requires either one of the following common download utilities. 
On most systems where Perl is installed, one or both of these utilities 
will already be present.

1) LWP::UserAgent, a Perl module for downloading files from 
the World Wide Web. LWP::UserAgent is part of the libwww-perl
module, which is available from CPAN. 
Please see the link in the L<SEE ALSO> section.

2) wget, a common Unix utility for downloading files from 
the World Wide Web. wget is also available for most Windows operating systems.
Please see the link in the L<SEE ALSO> section.

=head2 Usage

The program can be run in three different modes as follows:

1) Get a list of valid update dates:

  getPdbUpdate.pl dates

In this mode, the program will only retrieve and print a list of all valid update dates.

2) Get the files from the latest update:

  getPdbUpdate.pl latest

In this mode, the program will retrieve all files associated with the latest update. 
The files will be saved in some or all of the following directories underneath your current working directory: 

  <yyyymmdd>/added
  <yyyymmdd>/modified
  <yyyymmdd>/obsolete
  <yyyymmdd>/models_added
  <yyyymmdd>/models_modified
  <yyyymmdd>/models_obsolete

3) Get the files from any given update:

  getPdbUpdate.pl <yyyymmdd> 

  e.g.:
  getPdbUpdate.pl 20020708  

In this mode, the program will retrieve all files associated with the specified update date.
Again, the files will be saved in the same directory structure, e.g.:

  20020708/added
  20020708/modified
  20020708/obsolete
  20020708/models_added
  20020708/models_obsolete

=head2 Notes

1) Please note that this program will write some or all of the following additional files 
relative to your current working directory:

  ls-lR
  wgettest.temp 
  <yyyymmdd>/added.pdb
  <yyyymmdd>/added.sf
  <yyyymmdd>/added.nmr
  <yyyymmdd>/modified.pdb
  <yyyymmdd>/modified.sf
  <yyyymmdd>/modified.nmr
  <yyyymmdd>/obsolete.pdb
  <yyyymmdd>/obsolete.sf
  <yyyymmdd>/obsolete.nmr
  <yyyymmdd>/models_added.pdb
  <yyyymmdd>/models_modified.pdb
  <yyyymmdd>/models_obsolete.pdb

If any of these files are present prior to running this program, they will be overwritten!
A safe option would be to create a new directory first, and then run getPdbUpdate.pl, e.g.:

  mkdir newpdbfiles
  cd newpdbfiles
  getPdbUpdate.pl latest

2) If an added file (e.g. pdb1f5d.ent.Z from added.pdb) 
was obsoleted after the specified update date (20001212),
then the program will issue a warning, and save the file in the obsolete directory 
(20001212/obsolete) on local disk instead of the added directory.

3) As of 1 July 2002, all theoretical models were moved to a dedicated area of the 
PDB FTP archive, and effectively all access to them was removed from the PDB Web interface.
If a file (e.g. pdb1pln.ent.Z) represents a theoretical model, the program will 
issue a warning, and save the file in one of the models directories on local disk.

4) By default, the program will access the main PDB FTP servers at SDSC. 
Alternatively, another mirror FTP site may be specified as a second command line
argument, e.g.:

  getPdbUpdate.pl latest ftp://rutgers.rcsb.org/PDB/pub/pdb/

=head1 VERSION

This documentation refers to version 1.1 of getPdbUpdate.pl.

I<Version history:>

  Version  Date        Comments
  1.1      2002-09-13  Moved from LWP::Simple to LWP::UserAgent
                       to support proxies (e.g. from behind a firewall)
  1.0      2002-07-16  First release

=head1 AUTHOR

Wolfgang Bluhm ( mail@wbluhm.com ) for the Protein Data Bank ( info@rcsb.org )

=head1 BUGS

1) Not really a bug, but if your perl locations happens to be 
different from /usr/local/bin/perl, simply run the program as

  perl getPdbUpdate latest 

2) Be aware that only LWP::UserAgent::mirror, but not wget, preserves the original
time stamps of the files being downloaded. getPdbUpdate.pl is intended only for
your personal use, and hence this limitation may be of little consequence to you.
Please note that files downloaded by getPdbUpdate.pl should not be served to the
public through any kind of mirror site.

=head1 SEE ALSO

  http://www.rcsb.org/pdb/ -- Protein Data Bank (PDB) home page
  ftp://ftp.rcsb.org/pub/pdb -- PDB FTP site
  http://www.rcsb.org/pdb/cgi/resultBrowser.cgi?Date::update=1 -- Last PDB Update
  ftp://ftp.rcsb.org/pub/pdb/software -- download page for this script and documentation

  http://www.cpan.org/modules/by-module/LWP/ -- libwww-perl download page
  http://www.gnu.org/software/wget/wget.html -- wget home page

=head1 COPYRIGHT

                            Copyright 2002
               The Regents of the University of California
                          All Rights Reserved


 Permission to use, copy, modify and distribute any part of this PDB
 software for educational, research and non-profit purposes, without fee,
 and without a written agreement is hereby granted, provided that the above
 copyright notice, this paragraph and the following three paragraphs appear
 in all copies.
 
 Those desiring to incorporate this PDB Software into commercial products
 or use for commercial purposes should contact the Technology Transfer
 Office, University of California, San Diego, 9500 Gilman Drive, La Jolla,
 CA 92093-0910, Ph: (858) 534-5815, FAX: (858) 534-7345.
 
 In no event shall the University of California be liable to any party for
 direct, indirect, special, incidental, or consequential damages, including
 lost profits, arising out of the use of this PDB software, even if the
 University of California has been advised of the possibility of such
 damage.
 
 The PDB software provided herein is on an "as is" basis, and the
 University of California has no obligation to provide maintenance,
 support, updates, enhancements, or modifications.  The University of
 California makes no representations and extends no warranties of any kind,
 either implied or express, including, but not limited to, the implied
 warranties of merchantability or fitness for a particular purpose, or that
 the use of the pdb software will not infringe any patent, trademark or
 other rights.

=cut


################
# main program #
################

# The URL of the Protein Data Bank FTP archive
$pdbFtpUrl = "ftp://ftp.rcsb.org/pub/pdb/";

# Optional feature: set ftp url with second command line argument
$pdbFtpUrl = $ARGV[1] if $ARGV[1];

# Print usage info and quit if no command line argument was given
if ( ! $ARGV[0] ) {
	&usage;
	exit(0);
}

# Read command line argument
$updateDate = $ARGV[0];

# Print usage info and quit if command line argument was not
# 'dates', 'latest', or an 8-digit date
if ( $updateDate !~ /^\d{8}$/ 
	and $updateDate !~ /^latest$/i 
	and $updateDate !~ /^dates$/i ) {
	&usage;
	exit(0);
}

# Test whether we have LWP::UserAgent
eval{use LWP::UserAgent;};
if ( ! $@) {
	# LWP::UserAgent loaded successfully
	print STDERR "Using LWP::UserAgent::mirror for file downloads\n";
	$haveLWP = 1;
}
else {
	# If we don't have LWP::UserAgent, test whether we have wget
	eval{ system("wget -q -O wgettest.temp http://www.rcsb.org/pdb/index.html"); };
	if ( ! $@ and -e "wgettest.temp" ) {
		# We have a working wget
		print STDERR "Using wget for file downloads\n";
		$haveLWP = 0;
	}
	else {
		# We have neither LWP::UserAgent nor wget
		print STDERR "This program requires either LWP::UserAgent or wget\n";
		exit(0);
	}
}

# Get the ls-lR file (list of FTP archive content)
&get_lslR;

# Parse the ls-lF file
&parse_lslR;

# Choose between the three valid program modes
if ( $updateDate =~ /^latest$/i ) {
	$updateDate = $latest;
	print STDERR "Latest update date is $latest\n";
}
elsif ( $updateDate =~ /^dates$/i ) {
	&printUpdateDates;
	exit(0);
}
elsif ( ! exists $dates{$updateDate} ) {
	print STDERR "$updateDate is not a valid update date\n";
	&printUpdateDates;
	exit(0);
}

# Get the PDB files
&getFiles;

exit(0);


###############
# subroutines #
###############

# To sort numerically, instead of alphabetically
sub numerically { $a <=> $b }

# Get the ls-lR file (list of FTP archive content)
# and save it to local disk
sub get_lslR {
	print STDERR "Getting ls-lR file (content list of FTP archive)\n";
	$lslR_Url = $pdbFtpUrl . "ls-lR";
	eval{ &download($lslR_Url, "ls-lR"); };
	if ( $@ or ! -e "ls-lR" ) {
		print STDERR "Could not download or save ls-lR file\n";
		print STDERR "Trying one more time: ";
		eval{ &download($lslR_Url, "ls-lR"); };
		if ( $@ or ! -e "$saveAs" ) {
			print STDERR "Sorry, still could not get it. Please try again later.\n";
			exit(0);
		}
		else {
			print STDERR "Ok, got it this time!\n";
		}
	}
}
	
# Parse ls-lR file
# Survey content of the following FTP directories:
#   data/status/<yyyymmdd>
#   structures/{divided|obsolete}/{pdb|structure_factors|nmr_restraints} 
#   structures/models/{current|obsolete}/pdb
# Determine all update dates and the latest update date
#
sub parse_lslR {
	print STDERR "Parsing ls-lR file\n";
	open (IN, "<ls-lR");
	while (<IN>) {
		$line = $_;
		chomp $line;
		# update dates
		if ( $line =~ /data\/status\/(\d{8})/ ) {
			$date = $1;
			$latest = $date if ( $date > $latest );
			$dates{$date} = 1;
		}
		# contents of each update
		elsif ( $date and $line =~ /^-r.*\s(\S+)$/ ) {
			$status{$date}{$1} = 1;
		}
		# experimental structures
		elsif ( $line =~ /structures\/(divided|obsolete)\/(pdb|structure_factors|nmr_restraints)\/(..):/ ) {
			$level1 = $1;
			$level2 = $2;
			$level3 = $3;
		}
		# experimental structures
		elsif ( $level1 and $line =~ /^-r.*\s(\S+)$/ ) {
			$ftpContent{$level1}{$level2}{$level3}{$1} = 1;
		} 
		# theoretical models
		elsif ( $line =~ /structures\/models\/(current|obsolete)\/(pdb)\/(..):/ ) {
			$mlevel1 = $1;
			$mlevel2 = $2;
			$mlevel3 = $3;
		}
		# theoretical models
		elsif ( $mlevel1 and $line =~ /^-r.*\s(\S+)$/ ) {
			$modelContent{$mlevel1}{$mlevel2}{$mlevel3}{$1} = 1;
		}
		# reset
		elsif ( $line !~ /\S/ ) {
			$level1 = $level2 = $level3 = 0;
			$mlevel1 = $mlevel2 = $mlevel3 = 0;
			$date = 0;
		}
	}
	close IN;
}

# Print all update dates
sub printUpdateDates {
        print STDERR "Valid update dates are:\n";
        foreach ( reverse sort numerically keys %dates ) {
                $n++;
                print STDOUT "$_";
                $delimeter = ( $n%8 ? " " : "\n" );
                print STDOUT $delimeter;
        }
}

# Get the PDB file lists
# (added.pdb etc.)
sub getFiles {

	# Create the update date directory if necessary
	if ( ! -e "$updateDate" ) {
		eval{ mkdir "$updateDate", 0777; };
		if ( $@ ) {
			print STDERR "Could not make directory $updateDate\n";
			return;
		}
	}

	# The following are the files in each data/status/<yyyymmdd> directory
	# which in turn list all the PDB files associated with that update
	@fileLists = ( 
                       "added.pdb",    "added.sf",    "added.nmr",
		       "modified.pdb", "modified.sf", "modified.nmr",
		       "obsolete.pdb", "obsolete.sf", "obsolete.nmr",
		       "models_added.pdb",
		       "models_modified.pdb",
		       "models_obsolete.pdb"
		     );
	
	print STDERR "Starting file downloads\n";

	foreach $fileList ( @fileLists ) {

		# Was the fileList (e.g. models_added.pdb) included
		# in the status/<yyyymmdd> directory for the update date?
		if ( exists $status{$updateDate}{$fileList} ) {
			print STDERR "Getting $fileList\n";
		}
		else {
			print STDERR "No $fileList file for update of $updateDate\n";
			next;
		}
			

		# Get the file list from FTP and save to local disk
		$fileListUrl = $pdbFtpUrl . "data/status/$updateDate/$fileList";
		$saveAs = "$updateDate" . "/" . "$fileList";
		eval{ &download($fileListUrl, $saveAs); };
	
		# Provided the file list was successfully downloaded and saved,
		# get each file listed and save to local disk
		if ( $@ or ! -e $saveAs ) {
			print STDERR "Could not download or save file list $updateDate/$fileList\n";
			print STDERR "Trying one more time: ";
			eval{ &download($fileListUrl, $saveAs); };
			if ( $@ or ! -e "$saveAs" ) {
				print STDERR "Sorry, still could not get it. ";
				print STDERR "The download will be missing the files from this list.\n";
			}
			else {
				print STDERR "Ok, got it this time!\n";
			}
		}
		else {
			open (IN, "$saveAs");
			while (<IN>) {
				chomp;
				if ( /^-r.*\s(\S+)$/ ) {
					$fileName = $1;
					&getFile($fileList,$fileName);
				}
			}
			close IN;
		}
	}
}

# Attempt to get an individual PDB file
# (coordinates, structure factors, or nmr restraints)
sub getFile($$) {

	my $fileList = shift;
	my $fileName = shift;

	# Some files are listed without the '.Z' extension 
	# (e.g. in added.pdb), even though all files in the 
	# FTP archive have the '.Z' extention
	if ( $fileName !~ /\.Z$/ ) {
		$fileName .= ".Z";
	}
		
	# Determine the type of file (coordinates, structure factors, nmr restraints)
	# and the hash directory
	if ( $fileName =~ /^pdb.(..).\.ent/ ) {  # coordinate file
		$fileType = "pdb";
		$hashDir = $1;
	}
	elsif ( $fileName =~ /^r.(..).sf\.ent/ ) {  # structure factor file
		$fileType = "structure_factors";
		$hashDir = $1;
	}
	elsif ( $fileName =~ /^.(..).\.mr/ ) {  # nmr constraint file
		$fileType = "nmr_restraints";
		$hashDir = $1;
	}
	else {
		print STDERR "Could not parse the type of file $fileName\n";
		return;
	}
	
	# Set the directory under which the file will be saved on local disk
	$dirName = $updateDate;
	$fileList =~ /^(\S+)\./;
	$subDirName = $1;

	# Assemble FTP download URL
	# 
	# Was the file listed under the corresponding divided (current) directory?
	if ( exists $ftpContent{'divided'}{$fileType}{$hashDir}{$fileName} ) {
		$fileUrl = $pdbFtpUrl . "data/structures/divided/$fileType/$hashDir/$fileName";
	}
	# Was the file listed under the corresponding obsolete directory?
	elsif ( exists $ftpContent{'obsolete'}{$fileType}{$hashDir}{$fileName} ) {
		$fileUrl = $pdbFtpUrl . "data/structures/obsolete/$fileType/$hashDir/$fileName";
		$subDirName = "obsolete";
		if ( $fileList !~ /obsolete/ ) {
			print STDERR "WARNING: file $fileName from list $fileList"; 
			print STDERR " has been obsoleted since $updateDate\n";
		}
	}
	# Current model?
	elsif ( exists $modelContent{'current'}{$fileType}{$hashDir}{$fileName} ) {
		$fileUrl = $pdbFtpUrl . "data/structures/models/current/$fileType/$hashDir/$fileName";
		$subDirName = "models_" . $subDirName unless ( $subDirName =~ /models/ );
		if ( $fileList !~ /models/ ) {
			print STDERR "WARNING: file $fileName from list $fileList";
			print STDERR " is a theoretical model\n";
		}
	}
	# Obsolete model?
	elsif ( exists $modelContent{'obsolete'}{$fileType}{$hashDir}{$fileName} ) {
		$fileUrl = $pdbFtpUrl . "data/structures/models/obsolete/$fileType/$hashDir/$fileName";
		$subDirName = "models_obsolete";
		if ( $fileList !~ /models/ ) {
			print STDERR "WARNING: file $fileName from list $fileList";
			print STDERR " is a theoretical model\n";
		}
		if ( $fileList !~ /obsolete/ ) {
			print STDERR "WARNING: file $fileName from list $fileList"; 
			print STDERR " has been obsoleted since $updateDate\n";
		}
	}
	else {
		print STDERR "ERROR: Could not find file $fileName from list $fileList";
		return;
	}

	# Set the name under which the file will be saved on local disk
	$saveAs = "$dirName/$subDirName/$fileName";

	# Create sub directory (added, modified, obsolete) if necessary
	if ( ! -e "$DirName/$subDirName" ) {
		eval{ mkdir "$dirName/$subDirName", 0777; }; 
		if ( $@ ) {
			print STDERR "Could not make directory $dirName/$subDirName\n";
			return;
		}
	}

	# Get file from FTP and save to local disk
	print STDERR "$fileUrl -> $saveAs\n";
	eval{ &download($fileUrl, $saveAs); };

	if ( $@ or ! -e $saveAs ) {
		print STDERR "Could not download or save file $fileName from list $updateDate/$fileList\n";
		print STDERR "Trying one more time: ";
		eval{ &download($fileUrl, $saveAs); };
		if ( $@ or ! -e "$saveAs" ) {
			print STDERR "Sorry, still could not get it.\n";
			print STDERR "The download will be missing this file.\n";
		}
		else {
			print STDERR "Ok, got it this time!\n";
		}
	}
}

# Download a file and save to local disk
# Uses LWP::UserAgent::mirror if available, or wget otherwise.
# Program would have exited earlier if neither had been available.
sub download($) {
	my $url = shift;
	my $saveAs = shift;

	if ( $haveLWP ) {
		my $ua = LWP::UserAgent->new(env_proxy => 1);
		$ua->mirror($url, $saveAs);
	}
	else {
		system("wget -q -O $saveAs $url");
	}
}

# Print usage information
sub usage {
	print STDERR "Usage: getPdbUpdate.pl latest\n";
	print STDERR "   or: getPdbUpdate.pl dates\n";
	print STDERR "   or: getPdbUpdate.pl <yyyymmdd>\n";
	print STDERR "   eg: getPdbUpdate.pl 20020603\n";
}
