#! /usr/local/bin/perl 

# A Perl script for downloading a list of PDB files
# from the Protein Data Bank FTP archive.
#
# Type perldoc getPdbStructures.pl for usage information,
# or view the getPdbStructures.html file.

# Release number of this script (package variable, readable as $main::VERSION).
our $VERSION = '1.0';

#
# pod documentation 
#

=head1 NAME

getPdbStructures.pl

=head1 SYNOPSIS

A Perl utility for downloading a list of files from the Protein Data Bank FTP archive.

=head1 USAGE

  getPdbStructures.pl -l <list of PDB IDs> [-d <location for downloaded files> -s -n -c]

=head2 OVERVIEW

getPdbStructures.pl is a Perl program for downloading all Protein Data Bank (PDB) structure files listed
in a text file from the PDB FTP archive. 

=head2 REQUIREMENTS

getPdbStructures.pl requires either one of the following common download utilities.
On most systems where Perl is installed, one or both of these utilities
should already be present.

1) LWP::UserAgent, a Perl module for downloading files from
the World Wide Web. LWP::UserAgent is part of the libwww-perl
module, which is available from CPAN.
Please see the link in the L<SEE ALSO> section.

2) wget, a common Unix utility for downloading files from
the World Wide Web. wget is also available for most Windows operating systems.
Please see the link in the L<SEE ALSO> section.

=head2 ARGUMENTS

-l <list of PDB IDs> = path to text file listing the PDB IDs you wish to download.  This file should contain one four-character PDB ID code per line.  Do not separate the lines with commas.  See the example in the L<NOTES> section.

-d <location for downloaded files> = path to where you want the directory of downloaded files to be placed (optional)

-s = include structure factor files (optional)

-n = include NMR restraint files (optional)

-c = include mmCIF files (optional)

=head2 NOTES

1) Please note that this program will write a file titled "ls-lR" in your current working directory.

2) This program will download PDB-format files by default.  All downloaded files will be in UNIX-compressed (".Z") format.

3) Be aware that only LWP::UserAgent::mirror, but not wget, preserves the original
time stamps of the files being downloaded. getPdbStructures.pl is intended only for
your personal use, and hence this limitation may be of little consequence to you.
Please note that files downloaded by getPdbStructures.pl should not be served to the
public through any kind of mirror site.

4) The text file listing of PDB IDs should list the ID codes one per line as follows:

	100d
	1fjk
	4hhb

The characters can be in either upper or lower case. 

=head2 EXAMPLES

Example 1: Download only PDB-format files into a new, timestamped directory created under the current working directory.

	% getPdbStructures.pl -l list.txt 

Example 2: Download all four file types (PDB, mmCIF, structure factors, NMR restraints) into another directory.

	% getPdbStructures.pl -l list.txt -d ./interesting_structures/pdb/download -s -n -c

=head1 VERSION

This documentation refers to version 1.0 of getPdbStructures.pl.

I<Version history:>

  Version  Date        Comments
  1.0      2003-05-09  First release

=head1 AUTHOR

David J. Padilla for the Protein Data Bank ( info@rcsb.org )

=head1 BUGS

1) Not really a bug, but if your perl location happens to be
different from /usr/local/bin/perl, simply run the program as

  perl getPdbStructures.pl

=head1 SEE ALSO

  http://www.rcsb.org/pdb/ -- Protein Data Bank (PDB) home page
  ftp://ftp.rcsb.org/pub/pdb -- PDB FTP site
  ftp://ftp.rcsb.org/pub/pdb/software -- download page for this script and documentation
  http://www.cpan.org/modules/by-module/LWP/ -- libwww-perl download page
  http://www.gnu.org/software/wget/wget.html -- wget home page

=head1 COPYRIGHT

                            Copyright 2003
               The Regents of the University of California
                          All Rights Reserved


 Permission to use, copy, modify and distribute any part of this PDB
 software for educational, research and non-profit purposes, without fee,
 and without a written agreement is hereby granted, provided that the above
 copyright notice, this paragraph and the following three paragraphs appear
 in all copies.

 Those desiring to incorporate this PDB Software into commercial products
 or use for commercial purposes should contact the Technology Transfer
 Office, University of California, San Diego, 9500 Gilman Drive, La Jolla,
 CA 92093-0910, Ph: (858) 534-5815, FAX: (858) 534-7345.

 In no event shall the University of California be liable to any party for
 direct, indirect, special, incidental, or consequential damages, including
 lost profits, arising out of the use of this PDB software, even if the
 University of California has been advised of the possibility of such
 damage.

 The PDB software provided herein is on an "as is" basis, and the
 University of California has no obligation to provide maintenance,
 support, updates, enhancements, or modifications.  The University of
 California makes no representations and extends no warranties of any kind,
 either implied or express, including, but not limited to, the implied
 warranties of merchantability or fitness for a particular purpose, or that
 the use of the pdb software will not infringe any patent, trademark or
 other rights.

=cut



#
# MAIN PROGRAM
#

# Handle command-line options
use Getopt::Std;

# Switches understood by the script:
#   -s -n -c   also fetch structure factor, NMR restraint and mmCIF files
#   -l <file>  text file listing the PDB IDs to download (required)
#   -d <dir>   existing directory under which the download directory is created
#   -h         print usage information and stop
getopts('hsncd:l:', \%opts);

# If '-h' option was invoked, print usage info
if (exists $opts{'h'}) {
	usage();
	exit(0);
}

# Without '-l' there is nothing to download: point the user at '-h' and
# stop with a non-zero status so calling scripts can detect the failure.
if (! exists $opts{'l'}) {
	print STDERR "No location of file listing was included.  Use '-h' option for help\n";
	exit(1);
}
$fileName = $opts{'l'};

# Booleans selecting the optional file types
$bGetSf  = exists $opts{'s'} ? 1 : 0;	# structure factor files
$bGetNmr = exists $opts{'n'} ? 1 : 0;	# NMR restraint files
$bGetCif = exists $opts{'c'} ? 1 : 0;	# mmCIF files

# Download into the current directory by default
$downloadLocation = ".";
if (exists $opts{'d'}) {
	$downloadLocation = $opts{'d'};
	# The location must be a directory; an ordinary file of that name
	# cannot hold the download tree.
	if (! -d $downloadLocation) {
		print STDERR "$downloadLocation does not exist or is not a directory.  Exiting...\n";
		exit(1);
	}
}

# Other global variables we will need

# Timestamp used in the log header and in the download directory name.
# localtime() returns a zero-based month and a year counted from 1900, so
# both are normalised here.  Every field is zero-padded to two digits so the
# directory name always has the fixed form pdbYYMMDDhhmm and two different
# times can never produce the same name.
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
$year = sprintf("%02d", $year % 100);	# two-digit year, valid in any century
$mon  = sprintf("%02d", $mon + 1);	# 1..12 instead of 0..11
$mday = sprintf("%02d", $mday);
$hour = sprintf("%02d", $hour);
$min  = sprintf("%02d", $min);
$sec  = sprintf("%02d", $sec);

$pdbFtpUrl = "ftp://ftp.rcsb.org/pub/pdb"; # The URL of the Protein Data Bank (PDB) FTP archive
$downloadDir = "$downloadLocation/pdb$year$mon$mday$hour$min"; # The name we will give to the download directory

%ftpLocations = (); # Hash to hold available current FTP archive files and locations
%modelLocations = (); # Hash to hold theoretical model files and locations
%getTheseFiles = (); # Hash containing all files that will be actually downloaded
%dirSetup = (); # Hash to hold list of directories we will need set up

# Give %dirSetup object hooks (we would need this to retrieve the directories in insertion order)
#use Tie::IxHash;
#tie $dirSetup, "Tie::IxHash";

# Start logging information
print STDERR "$0 run $year$mon$mday at $hour:$min:$sec\n\n";
 
# Check if we have LWP::UserAgent or wget
print STDERR "Checking if we have either LWP::UserAgent or wget...\n";

# The module is loaded with a run-time 'require'.  A 'use' statement is
# executed at compile time even inside eval{}, so a missing module would
# abort the script before the wget fallback below could be tried.
if ( eval { require LWP::UserAgent; 1 } ) {
        # We have LWP::UserAgent
        print STDERR "Using LWP::UserAgent::mirror for file downloads\n";
        $haveLWP = 1;
}
else {
        # If we don't have LWP::UserAgent, test whether we have wget.
        # system() signals failure through its return value rather than by
        # dying, so the probe is judged by whether wget wrote its output file.
        system("wget -q -O wgettest.temp http://www.rcsb.org/pdb/index.html");
        if ( -e "wgettest.temp" ) {
                # We have wget; the probe file is no longer needed
                unlink "wgettest.temp";
                print STDERR "Using wget for file downloads\n";
                $haveLWP = 0;
        }
        else {
                # We have neither LWP::UserAgent nor wget
                print STDERR "This program requires either LWP::UserAgent or wget.  Exiting.\n";
                exit(1);
        }
}

# The run is a fixed sequence of stages.  Each stage prints a banner to the
# log and then calls its steps in order:
#   1. fetch the ls-lR index from the PDB FTP server and parse it
#   2. parse the requested file listing, then set up the local directories
#   3. retrieve the actual files
my @pipeline = (
	[ "\n",              \&get_lslR,         \&parse_lslR ],
	[ "\n",              \&parseFileListing, \&setUpDirs  ],
	[ "\nDOWNLOADING\n", \&retrieveFiles ],
);

foreach my $stage (@pipeline) {
	my ($banner, @steps) = @$stage;
	print STDERR $banner;
	foreach my $step (@steps) {
		$step->();
	}
}

# Process complete
print STDERR "\nProcess complete.\n";



#
# SUBROUTINES
#

# Fetch the "ls-lR" content listing of the PDB FTP archive into the current
# working directory, retrying once on failure.
#
# Reads the global $pdbFtpUrl.  An attempt counts as successful only when it
# raised no exception and left a non-empty "ls-lR" file behind; an empty file
# is useless for parsing and is what a failed wget -O leaves on disk.
# Exits the program with status 1 when both attempts fail.
sub get_lslR {
	my $lslR_Url = $pdbFtpUrl . "/" . "ls-lR";
	print STDERR "Attempting to get ls-lR file (content list of FTP archive) from $lslR_Url...\n";

	# One download attempt; true when a usable ls-lR file is present afterwards.
	# The & call form sidesteps any prototype declared on download().
	my $fetch = sub {
		eval { &download($lslR_Url, "ls-lR"); };
		return ( ! $@ and -s "ls-lR" ) ? 1 : 0;
	};

	if ( $fetch->() ) {
		print STDERR "\tGot the ls-lR file\n";
		return;
	}

	print STDERR "Could not download or save ls-lR file\n";
	print STDERR "Trying one more time: ";
	if ( ! $fetch->() ) {
		print STDERR "Still could not download or save ls-lR file.  Program exiting.  Please try again later.\n";
		exit(1);
	}
	print STDERR "Ok, got the ls-lR file on the second try.  Continuing...\n";
}

# Download one URL to a local file.
#
# Arguments:
#   $url     - address of the remote file
#   $saveAs  - local path to write to
#
# Uses LWP::UserAgent::mirror when the global $haveLWP is true, otherwise the
# external wget program.
#
# Returns 1 when the transfer succeeded (for mirror, a "304 Not Modified"
# answer also counts, since the local copy is then already current) and 0
# otherwise.
sub download {
	my ($url, $saveAs) = @_;

	if ( $haveLWP ) {
		my $ua = LWP::UserAgent->new(env_proxy => 1);
		my $response = $ua->mirror($url, $saveAs);
		return ( $response->is_success or $response->code == 304 ) ? 1 : 0;
	}

	# The list form of system() runs wget directly, without a shell, so
	# spaces or shell metacharacters in the path or URL are passed through
	# unchanged.  wget exits with status 0 on success.
	return system("wget", "-q", "-O", $saveAs, $url) == 0 ? 1 : 0;
}

# Parse the "ls-lR" listing in the current working directory and record where
# every regular file of interest lives in the archive.
#
# Fills two global hashes, keyed by file name, with [status, type, subdir]:
#   %ftpLocations   - experimental structures; status is "divided" or
#                     "obsolete", type is pdb, mmCIF, structure_factors or
#                     nmr_restraints
#   %modelLocations - theoretical models; status is "current" or "obsolete",
#                     type is pdb or mmCIF
# subdir is the two-character directory name the file sits in.
#
# Dies when the ls-lR file cannot be opened.
sub parse_lslR {
	print STDERR "Parsing ls-lR file...\n";

	open(my $in, '<', 'ls-lR') or die "Could not open ls-lR for reading: $!\n";

	# Directory currently being listed: one triple for the experimental
	# area and one for the model area.  At most one of them is set at a time.
	my ($current,  $type,  $hash)  = (0, 0, 0);
	my ($mcurrent, $mtype, $mhash) = (0, 0, 0);

	while (my $line = <$in>) {
		chomp $line;
		# Header of a directory in the experimental area
		if ( $line =~ m{structures/(divided|obsolete)/(pdb|mmCIF|structure_factors|nmr_restraints)/(..):} ) {
			($current, $type, $hash) = ($1, $2, $3);
			# A new header ends any model listing still in progress
			($mcurrent, $mtype, $mhash) = (0, 0, 0);
		}
		# Regular file inside an experimental directory
		elsif ( $current and $line =~ /^-r.*\s(\S+)$/ ) {
			$ftpLocations{$1} = [ "$current", "$type", "$hash" ];
		}
		# Header of a directory in the theoretical model area
		elsif ( $line =~ m{structures/models/(current|obsolete)/(pdb|mmCIF)/(..):} ) {
			($mcurrent, $mtype, $mhash) = ($1, $2, $3);
			# A new header ends any experimental listing still in progress
			($current, $type, $hash) = (0, 0, 0);
		}
		# Regular file inside a model directory
		elsif ( $mcurrent and $line =~ /^-r.*\s(\S+)$/ ) {
			$modelLocations{$1} = [ "$mcurrent", "$mtype", "$mhash" ];
		}
		# A blank line ends the listing of the current directory
		elsif ( $line !~ /\S/ ) {
			($current,  $type,  $hash)  = (0, 0, 0);
			($mcurrent, $mtype, $mhash) = (0, 0, 0);
		}
	}
	close($in);
}

# Read the user's list of PDB IDs and decide which archive files to download.
#
# Each line of the list file (global $fileName) is trimmed and lower-cased.
# Blank lines are ignored silently; anything that is not a four-character
# alphanumeric code is reported and skipped.  A valid ID is looked up in the
# parsed ls-lR index: first among experimental structures (%ftpLocations),
# then among theoretical models (%modelLocations).  IDs found in neither are
# reported and skipped.
#
# The PDB-format file is always queued.  Depending on the global flags
# $bGetSf, $bGetNmr and $bGetCif the matching structure factor, NMR restraint
# and mmCIF files are queued too when the archive has them.  Structure
# factors and NMR restraints are experimental data, so they are never looked
# up for theoretical models.
#
# Results go into the globals %getTheseFiles (local path => path below the
# archive's data directory) and %dirSetup (local directories to create).
#
# Dies when the list file cannot be opened.
sub parseFileListing {
	print STDERR "Parsing list of requested files in $fileName...\n";

	open(my $list, '<', $fileName) or die "Could not open PDB ID list $fileName: $!\n";

	while (my $pdbId = <$list>) {
		# Remove newlines and carriage returns
		chomp($pdbId);
		$pdbId =~ s/\r$//;
		# Remove any extra space at the ends of the string
		$pdbId =~ s/^\s+//;
		$pdbId =~ s/\s+$//;
		# Convert to lower case
		$pdbId = lc($pdbId);

		# Blank line - we won't waste user's disk space on warnings about these
		next if $pdbId =~ m/^\s*$/;

		if ($pdbId !~ m/^[a-zA-Z0-9]{4}$/) {
			# This is not in the correct format (alphanumeric 4-character code)
			print STDERR "$pdbId is... \tnot in the correct PDB ID format and will be skipped\n";
			next;
		}

		print STDERR "$pdbId is...";

		# The PDB-format file name decides whether the ID is an
		# experimental structure or a theoretical model.
		my $pdbString = "pdb" . $pdbId . ".ent.Z";
		my $isModel;
		if (exists $ftpLocations{$pdbString}) {
			$isModel = 0;
			$dirSetup{"pdb"} = 1;	# We will need this subdirectory
			print STDERR "\tan experimental structure\n";
		} elsif (exists $modelLocations{$pdbString}) {
			$isModel = 1;
			$dirSetup{"models"} = 1;  # We will need this subdirectory
			print STDERR "\ta theoretical model\n";
		} else {
			# The structure is neither experimental nor a model.
			print STDERR "\tnot found in the PDB and will be skipped\n";
			next;
		}

		# We will always download the PDB format file
		if (! _queueArchiveFile($pdbString, "PDB format file", $isModel)) {
			print STDERR "\t\tPDB format file: $pdbString NOT available\n";
		}

		# Experimental data files exist only for experimental structures
		if ($bGetSf and ! $isModel) {
			_queueArchiveFile("r" . $pdbId . "sf.ent.Z", "Structure factor file", 0);
		}
		if ($bGetNmr and ! $isModel) {
			_queueArchiveFile($pdbId . ".mr.Z", "NMR restraint file", 0);
		}

		# mmCIF files are available for structures and models alike
		if ($bGetCif) {
			_queueArchiveFile($pdbId . ".cif.Z", "mmCIF format file", $isModel);
		}
	}
	close($list);
}

# Queue one archive file for download if the parsed ls-lR index lists it.
#
# Arguments:
#   $file     - file name as it appears in the archive listing
#   $label    - description used in the "... available" log line
#   $isModel  - true to look in the theoretical model area (%modelLocations),
#               false for the experimental area (%ftpLocations)
#
# On success the local directory is recorded in %dirSetup, the file is added
# to %getTheseFiles, a log line is printed and 1 is returned.  Returns 0,
# changing nothing, when the file is not in the index.
sub _queueArchiveFile {
	my ($file, $label, $isModel) = @_;

	my $locations = $isModel ? \%modelLocations : \%ftpLocations;
	return 0 unless exists $locations->{$file};

	my ($status, $type, $subdir) = @{ $locations->{$file} };

	# The archive keeps NMR restraints under "nmr_restraints"; the local
	# tree calls that directory "nmr".  All other types keep their name.
	my $localType = $type eq "nmr_restraints" ? "nmr" : $type;

	my ($localDir, $remoteDir);
	if ($isModel) {
		return 0 unless $status eq "current" or $status eq "obsolete";
		$localDir  = "models/$status/$localType/$subdir";
		$remoteDir = "structures/models/$status/$type/$subdir";
	} elsif ($status eq "divided") {
		$localDir  = "$localType/$subdir";
		$remoteDir = "structures/$status/$type/$subdir";
	} elsif ($status eq "obsolete") {
		$localDir  = "obsolete/$localType/$subdir";
		$remoteDir = "structures/$status/$type/$subdir";
	} else {
		return 0;
	}

	$dirSetup{$localDir} = 1;
	$getTheseFiles{"$downloadDir/$localDir/$file"} = "$remoteDir/$file";
	print STDERR "\t\t$label: $file available\n";
	return 1;
}

# Download every file queued in the global %getTheseFiles, in sorted order of
# local path.  Keys are local paths; values are paths below the archive's
# "data" directory (the full URL is $pdbFtpUrl/data/<value>).
#
# Each file is reported as downloaded only if a non-empty local file exists
# afterwards.  A failed or dying download is logged and the remaining files
# are still attempted.
sub retrieveFiles {
	foreach my $local (sort keys %getTheseFiles) {
		my $remote = "$pdbFtpUrl/data/$getTheseFiles{$local}";

		# The & call form sidesteps any prototype declared on download()
		eval { &download($remote, $local); };

		if ( ! $@ and -s $local ) {
			print STDERR "$local downloaded from $remote\n";
		} else {
			print STDERR "$local could NOT be downloaded from $remote\n";
		}
	}
}


# Create the local directory tree for the downloaded files.
#
# Makes the base download directory (global $downloadDir) and then, for every
# key of the global %dirSetup, each component of that relative path in turn,
# so parent directories are created before their children.  Directories that
# already exist are left alone.
#
# mkdir() reports failure through its return value, so that value is checked
# directly.  On the first failure a message naming the directory and the
# system error is printed to STDERR and the sub returns without creating
# anything further.
sub setUpDirs {
	# Set up the basic download directory.  We will always need this directory
	if ( ! -d $downloadDir and ! mkdir($downloadDir, 0777) ) {
		print STDERR "Could not make directory $downloadDir: $!\n";
		return;
	}

	# Set up other directories based on values in the directory-setup hash
	foreach my $dir ( sort keys %dirSetup ) {
		my $path = $downloadDir;
		foreach my $part ( split(/\//, $dir) ) {
			$path .= "/$part";
			next if -d $path;
			if ( ! mkdir($path, 0777) ) {
				print STDERR "Could not make directory $path: $!\n";
				return;
			}
		}
	}
}

# Write the command-line synopsis and a one-line description of every
# option to STDERR.
sub usage {
	my @helpLines = (
		"Usage: getPdbStructures.pl -l <listing> [-d <location> -s -n -c]",
		"where",
		"\t-l <listing> = the path to the text file listing the PDB IDs you wish to download",
		"\t-d <location> = the path to where the files should be placed (optional - defaults to current directory)",
		"\t-s  = include structure factor files (optional)",
		"\t-n  = include NMR restraint files (optional)",
		"\t-c  = include mmCIF files (optional)",
	);
	print STDERR join("\n", @helpLines), "\n";
}
