#!/usr/bin/perl
############################################################################
# Authors: Junghwan Park, Iliana Avila-Campillo
# Last modified: December 7, 2005 by Iliana
# Data is downloaded from ftp site into ./kgml directory
# Requires Perl module XML::Simple, download and install from CPAN.
############################################################################
use warnings;
use XML::Simple;
use DBI();
use Cwd;
print "------------------- update_kegg.pl --------------------\n";
if(scalar(@ARGV) < 3){
	print "USAGE perl update_kegg.pl <db user> <db password> <db name>\n";
	die;
}
$dbuser = $ARGV[0];
$dbpwd = $ARGV[1];
$dbname = $ARGV[2];
#kdrew: commenting out unused var
#$dbid = "kegg";
# download current KEGG KGML
system('rm -r kgml');
system('mkdir kgml');
print "Getting kgml.tar.gz... ";
system('wget 
ftp://ftp.genome.jp/pub/kegg/xml/kgml.tar.gz --directory-prefix=kgml') == 0 or die "$?\n";
print "done\n";
print "Uncompressing... ";
system('cd kgml;gunzip < kgml.tar.gz | tar xvf -') == 0 or die "$?\n";
print "done\n";
system('cd kgml;rm list.lst');
system('cd kgml;ls -R1 > list.lst'); # recursive and print one entry per line of output
open (LIST, "kgml/list.lst") or die "Could not open kgml/list.lst\n";
$| = 1;
my @filename = ();
print "Storing xml file names into array...\n";
while (<LIST>) {
	chomp;
	if ($_ =~ /^\.\/([A-Za-z0-9]*)\:/) {
		$org = $1; #directory name
	}
	
	if ($_ =~ /([A-Za-z0-9]+)\.xml$/) {
		# xml file
		$filename = './'.$org.'/'.$1.'.xml';
		#print "\t$filename\n";
		push(@filename, $filename);
	}
}
print "done\n";
print "Creating database and tables...";
$dbh = DBI->connect("dbi:mysql:host=localhost", $dbuser, $dbpwd) or die "Can't make database connect: $DBI::errstr\n";
$dbh->do("DROP DATABASE IF EXISTS $dbname");
$dbh->do("CREATE DATABASE $dbname") or die "Error: $dbh->errstr";
$dbh->do("USE $dbname") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_name 	(path VARCHAR(20),	name VARCHAR(100), 	UNIQUE(path, 	name))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE gene_name 	(gene VARCHAR(20), 	name VARCHAR(20),	UNIQUE(gene, 	name))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE xml_obj 		(id VARCHAR(20), 	type VARCHAR(20), 	UNIQUE(id,	type))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE org_gene 	(org VARCHAR(5), 	gene VARCHAR(20),	UNIQUE(org,	gene))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE org_path 	(org VARCHAR(5), 	path VARCHAR(20),	UNIQUE(org,	path))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_gene 	(path VARCHAR(20), 	gene VARCHAR(20),	UNIQUE(path,	gene))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_cpd 	(path VARCHAR(20), 	cpd VARCHAR(20),	UNIQUE(path,	cpd))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_enz 	(path VARCHAR(20), 	enz VARCHAR(20),	UNIQUE(path,	enz))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_path 	(path1 VARCHAR(20), 	path2 VARCHAR(20),	UNIQUE(path1,	path2))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_rel 	(path VARCHAR(20), 	entry1 VARCHAR(20), entry2 VARCHAR(20), UNIQUE(path,entry1,entry2))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE path_rxn 	(path VARCHAR(20), 	rxn VARCHAR(20),	UNIQUE(path,	rxn))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE gene_rxn (gene VARCHAR(20), rxn VARCHAR(20), UNIQUE(gene, rxn))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE gene_map (gene VARCHAR(20), map VARCHAR(20), UNIQUE(gene, map))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE enz_map (enz VARCHAR(20), map VARCHAR(20), UNIQUE(enz, map))") or die "Error: $dbh->errstr";
$dbh->do("CREATE TABLE rxn_cpd (rxn VARCHAR(20), cpd VARCHAR(20), UNIQUE(rxn, cpd))") or die "Error: $dbh->errstr";
print "done\n";
$total = $#filename+1;
$cur = 0;
my @sqls = ();
print "KEGG KGML file DB Importing Progress:\n";
#kdrew: commenting out unused var
#$ttt = 100;
$starttime = time;
foreach $filename (@filename) {
	$nowtime = time;	
	if ($cur == 0) {
		$totaltime = 0;
	} else {
		$totaltime = $total * ($nowtime-$starttime)/$cur;
	}
	$lefttime = $totaltime - ($nowtime - $starttime);
	
	$totalhr  = int($totaltime/3600);
	$totalmin = int(($totaltime-$totalhr*3600)/60);
	$totalsec = int($totaltime-$totalhr*3600-$totalmin*60);
	$lefthr   = int($lefttime/3600);
	$leftmin  = int(($lefttime-$lefthr*3600)/60);
	$leftsec  = int($lefttime-$lefthr*3600-$leftmin*60);
	
	printf("\r$filename - progress: %5.2f\%, total: %2d:%2d:%2d, left: %2d:%2d:%2d",($cur*100.0/$total), $totalhr, $totalmin, $totalsec, $lefthr, $leftmin, $leftsec);
	my $file = 'kgml/'.$filename;
	$filename =~ /\.\/([a-z]+)\/([a-z0-9]+)\.xml/;
	
	my $xs1 = XML::Simple->new();
	my $doc = $xs1->XMLin($file);
	$orgname = $doc->{'org'};
	$pathname = $doc->{'name'};
	$dbh->do("INSERT IGNORE INTO org_path VALUES(?, ?)", undef, $orgname, $pathname) or die "Error: $dbh->errstr";
	
	$entry = $doc->{'entry'};
	
	foreach $entry_name (keys(%$entry)) {
		$dbh->do("INSERT IGNORE INTO xml_obj VALUES(?, ?)", undef, $pathname.":".$entry->{$entry_name}->{"id"}, $entry->{$entry_name}->{"type"}) or die "Error: $dbh->errstr";
		
		if ($entry->{$entry_name}->{"type"} eq "gene") {
			$genenames = $entry_name;
			@genenames = split(/\s/, $genenames);
			foreach $genename (@genenames) {
				$dbh->do("INSERT IGNORE INTO org_gene VALUES(?, ?)", undef, $orgname, $genename) or die "Error: $dbh->errstr";
				$dbh->do("INSERT IGNORE INTO path_gene VALUES(?, ?)", undef, $pathname, $genename) or die "Error: $dbh->errstr";
				
				if (exists($entry->{$entry_name}->{"map"})) {
					$dbh->do("INSERT IGNORE INTO gene_map VALUES(?, ?)", undef, $genename, $pathname.":".$entry->{$entry_name}->{"map"}) or die "Error: $dbh->errstr";
				} else {
					@cgenenames = split(/,\s+/,$entry->{$entry_name}->{"graphics"}->{"name"});
					foreach $cgenename (@cgenenames) {
						$dbh->do("INSERT IGNORE INTO gene_name VALUES(?, ?)", undef, $genename, $cgenename) or die "Error: $dbh->errstr";
					}
					@rxnnames = split(/\s+/,$entry->{$entry_name}->{"reaction"});
					foreach $rxnname (@rxnnames) {
						$dbh->do("INSERT IGNORE INTO gene_rxn VALUES(?, ?)", undef, $genename, $rxnname) or die "Error: $dbh->errstr";
					}
				}
			}
		}
		if ($entry->{$entry_name}->{"type"} eq "compound") {
			$dbh->do("INSERT IGNORE INTO path_cpd VALUES(?, ?)", undef, $pathname, $entry_name) or die "Error: $dbh->errstr";
		}
		if ($entry->{$entry_name}->{"type"} eq "enzyme") {
			$dbh->do("INSERT IGNORE INTO path_enz VALUES(?, ?)", undef, $pathname, $entry_name) or die "Error: $dbh->errstr";
			if (exists($entry->{$entry_name}->{"map"})) {
				$dbh->do("INSERT IGNORE INTO enz_map VALUES(?, ?)", undef, $entry_name, $pathname.":".$entry->{$entry_name}->{"map"}) or die "Error: $dbh->errstr";
			}
		}
		if ($entry->{$entry_name}->{"type"} eq "map") {
			$dbh->do("INSERT IGNORE INTO path_path VALUES(?, ?)", undef, $pathname, $entry_name) or die "Error: $dbh->errstr"; 
			
			$fullname = $entry->{$entry_name}->{"graphics"}->{"name"};
			if ($fullname =~ /^TITLE\:([A-Za-z0-9\s\:_-]+)/) {
				$fullname = $1;
			}
			$dbh->do("INSERT IGNORE INTO path_name VALUES(?, ?)", undef, $entry_name, $fullname) or die "Error: $dbh->errstr";
		}
	}
	
	$rxns = $doc->{"reaction"};
	foreach $rxnid (keys(%$rxns)) {
		$dbh->do("INSERT IGNORE INTO path_rxn VALUES(?, ?)", undef, $pathname, $rxnid) or die "Error: $dbh->errstr";
		
		$rxn = $rxns->{$rxnid};
		if (exists($rxn->{"product"})) {
			unless (exists($rxn->{"product"}->{'name'})) {
				foreach $product (keys(%{$rxn->{"product"}})) {
					$dbh->do("INSERT IGNORE INTO rxn_cpd VALUES(?, ?)", undef, $rxnid, $product) or die "Error: $dbh->errstr";
				}
			} else {
				$dbh->do("INSERT IGNORE INTO rxn_cpd VALUES(?, ?)", undef, $rxnid, $rxns->{$rxnid}->{"product"}->{"name"}) or die "Error: $dbh->errstr";
			}
		}
		if (exists($rxn->{"substrate"})) {
			unless (exists($rxn->{"substrate"}->{'name'})) {
				foreach $substrate (keys(%{$rxn->{"substrate"}})) {
					$dbh->do("INSERT IGNORE INTO rxn_cpd VALUES(?, ?)", undef, $rxnid, $substrate) or die "Error: $dbh->errstr";
				}
			} else {
				$dbh->do("INSERT IGNORE INTO rxn_cpd VALUES(?, ?)", undef, $rxnid, $rxns->{$rxnid}->{"substrate"}->{"name"}) or die "Error: $dbh->errstr";
			}
		}
	}
	@rels = $doc->{"relation"};
	foreach $rel (@rels) {
		if (ref($rel) eq 'HASH') {
			$entry1 = $entry2 = $subtype = $value = $type = "";
			$entry1 = $rel2->{"entry1"};
			$entry2 = $rel2->{"entry2"};
			$subtype = $rel2->{"subtype"};
			$value = $pathname.":".$subtype->{"value"};
			$type = $rel2->{"type"} if (exists $rel2->{"type"});
			$dbh->do("INSERT IGNORE INTO path_rel VALUES(?, ?, ?)", undef, $pathname, $entry1, $entry2) or die "Error: $dbh->errstr";
			$dbh->do("INSERT IGNORE INTO path_rel VALUES(?, ?, ?)", undef, $pathname, $entry2, $entry1) or die "Error: $dbh->errstr";
		} else {
			@rels2 = @$rel;
			foreach $rel2 (@rels2) {
				$entry1 = $entry2 = $subtype = $value = $type = "";
				$entry1 = $rel2->{"entry1"};
				$entry2 = $rel2->{"entry2"};
				$subtype = $rel2->{"subtype"};
				$value = $pathname.":".$subtype->{"value"};
				$type = $rel2->{"type"} if (exists $rel2->{"type"});
				$dbh->do("INSERT IGNORE INTO path_rel VALUES(?, ?, ?)", undef, $pathname, $entry1, $entry2) or die "Error: $dbh->errstr";
				$dbh->do("INSERT IGNORE INTO path_rel VALUES(?, ?, ?)", undef, $pathname, $entry2, $entry1) or die "Error: $dbh->errstr";
			}
		}
	}
	$cur++;
}
$starttime = time;
print "CREATE TABLE gene_cpd\n";
$dbh->do("CREATE TABLE gene_cpd (gene VARCHAR(20), cpd VARCHAR(20), KEY(gene, cpd))") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "INSERT INTO gene_cpd\n";
$dbh->do("INSERT IGNORE INTO gene_cpd select gene_rxn.gene, rxn_cpd.cpd from gene_rxn, rxn_cpd where gene_rxn.rxn = rxn_cpd.rxn") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "Add org col\n";
$dbh->do("ALTER TABLE gene_cpd ADD org VARCHAR(5)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "Update org col\n";
$dbh->do("UPDATE gene_cpd, org_gene SET gene_cpd.org = org_gene.org WHERE gene_cpd.gene = org_gene.gene") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "Create Table cpd_score\n";
$dbh->do("CREATE TABLE cpd_score SELECT cpd, count(*) AS score FROM gene_cpd GROUP BY cpd") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "add index gene\n";
$dbh->do("ALTER TABLE gene_cpd ADD INDEX (gene)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "add index cpd\n";
$dbh->do("ALTER TABLE gene_cpd ADD INDEX (cpd)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "add index gene, cpd, org\n";
$dbh->do("ALTER TABLE gene_cpd ADD INDEX (gene, cpd, org)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "add index cpd\n";
$dbh->do("ALTER TABLE cpd_score ADD INDEX (cpd)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
# gene_cpd_gene_score contains "interactions" between genes that share a compound and are of the same organism
# the score reflects the total number of genes (regardless of organism) that are connected to the compound
$starttime = time;
print "create table gene_cpd_gene_score\n";
$dbh->do("CREATE TABLE gene_cpd_gene_score SELECT g1.gene AS gene1, g1.cpd AS cpd, g2.gene AS gene2, cs.score AS score, g1.org AS org FROM gene_cpd as g1, gene_cpd AS g2, cpd_score AS cs WHERE g1.cpd = g2.cpd AND g1.org = g2.org AND g1.gene != g2.gene AND cs.cpd = g1.cpd") or die "Error: $dbh->errstr";
print "deleting duplicates...\n";
$dbh->do("DELETE FROM gene_cpd_gene_score WHERE gene2 < gene1;");
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "add index gene1, gene2\n";
$dbh->do("ALTER TABLE gene_cpd_gene_score ADD INDEX (gene1, gene2)") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
# gene-gene interactions with a minimum score obtained from the minimum score of all compounds that mediate the "interaction"
# this could be used if all we are interested in is connecting genes in a network, and we don't care of the specific compounds that mediate the interactions
# using this table to create interactions minimizes multiple edges between nodes that share many compounds
$starttime = time;
print "create table gene_gene_score\n";
$dbh->do("CREATE TABLE gene_gene_score SELECT gcgs.gene1 AS gene1, gcgs.gene2 AS gene2, gcgs.cpd AS cpd, MIN(gcgs.score) as score, gcgs.org as org FROM gene_cpd_gene_score AS gcgs GROUP BY gcgs.gene1, gcgs.gene2") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
$starttime = time;
print "create table cpd_name\n";
$dbh->do("CREATE TABLE cpd_name (cpd VARCHAR(20), name VARCHAR(50), UNIQUE(cpd,name))") or die "Error: $dbh->errstr";
print "Getting COMPOUND...";
system('wget 
ftp://ftp.genome.jp/pub/kegg/ligand/compound --directory-prefix=kgml') == 0 or die "$?\n";
print "done.\n";
open (COMPOUND, "kgml/compound") or die "Could not open kgml/compound\n";
open (CP_OUT, ">kgml/compound_name.txt") or die "Could not create file kgml/compound_name.txt\n";
while($line = <COMPOUND>){
	if($line =~ /^ENTRY/){
		@fields = split(/\s+/,$line);
		print CP_OUT "cpd:"."$fields[1]\t";
	}elsif ($line =~ /^NAME/){
		@fields = split(/\s+/,$line);
		chop($fields[1]); # take out the ";"
		print CP_OUT "$fields[1]\n";
	}
}
$fullFilePath = getcwd()."/kgml/compound_name.txt";
$dbh->do("LOAD DATA LOCAL INFILE \'${fullFilePath}\' INTO TABLE cpd_name") or die "Error: $dbh->errstr";
$nowtime = time;
print "time elapsed: ".($nowtime-$starttime)."\n";
close(CP_OUT);
close(COMPOUND);
$dbh->disconnect();
#kdrew: don't call update_synonyms_kegg directly
#	### Lastly, update synonyms related tables
#	#kdrew: changed from bionetbuilder_info to localhost and added 'use' command
#	$bioneth = DBI->connect("dbi:mysql:host=localhost", $dbuser, $dbpwd) or die "Can't make database connect: $DBI::errstr\n";
#	$dbh->do("USE bionetbuilder_info") or die "Error: $dbh->errstr";
#
#	#$dbh = DBI->connect("dbi:mysql:host=localhost", $dbuser, $dbpwd) or die "Can't make database connect: $DBI::errstr\n";
#
#	# get the current synonyms db
#	my $sth = $bioneth->prepare_cached("SELECT dbname FROM db_name WHERE db=?") or die "Error: $dbh->errstr";
#	$sth->execute("synonyms") or die "Error: $dbh->errstr";
#		
#	while ($row = $sth->fetchrow_hashref()) {
#		$synonyms = $row->{'dbname'};
#	}
#
#	system("./update_synonyms_kegg.pl $dbuser $dbpwd ${synonyms} $dbname");
#
print "\n--------------------- Leaving update_kegg.pl ----------------------\n";