#!	/usr/bin/perl	-w

=head1	header

	###########################################################
	#  
	#  This script evaluates gene overlap enrichment
	#  
	#  perl OverlapEnrich.pl BackgroundGenes.txt OverlapGene.txt 1000 100
	#  
	#  Input arguments :
	#     BackgroundGenes.txt : list of background genes used for estimating null distribution (= all genes in the analytical environment. e.g., genes in PPI network)
	#     OverlapGene.txt : list of categories and test genes to be evaluated for enrichment
	#                       One row per one category
	#                       Col 1 : Name of the category
	#                       Col 2 : No. genes overlapping with test genes (should be pre-calculated)
	#                       Col 3- : Test gene list (e.g., drug target genes)
	#     1000 : No. genes sampled from background genes
	#     100 : No. iterations to be used to test significance
	#  
	#  Output files :
	#     BackgroundGenes-OverlapGene_1000_100.txt : No. overlap genes in each iteration step for each category
	#     BackgroundGenes-OverlapGene_1000_100_Sum.txt : Summary of enrichment analysis
	#  
	#  Any questions to Yukinori Okada (http://plaza.umin.ac.jp/~yokada/datasource/software.htm   yokada@broadinstitute.org)

=cut

use	strict;
use	List::Util;

# Script entry point: hand the command-line arguments to main, then finish
# with exit status 0. The '&' call form skips prototype checking, so the call
# compiles cleanly even though main is defined further down the file.
&main(@ARGV);
exit(0);

=head1	main

	main($OrgGenefile, $TestGenefile, $numGene, $Rep)

	Runs the sampling-based overlap enrichment analysis.

	  $OrgGenefile  : background gene file; the first tab-separated column of
	                  each non-blank line is taken as a gene name
	  $TestGenefile : category file; per line: category name, observed overlap
	                  count, then the test genes (all tab-separated)
	  $numGene      : number of background genes drawn in each iteration
	  $Rep          : number of iterations

	In every iteration $numGene genes are drawn without replacement from the
	background list and, for each category, the number of its test genes found
	in the draw is counted.

	Two files are written. Their names are built from the two input file names
	(extension removed, directory of the second file dropped), $numGene and $Rep:
	  <bg>-<test>_<numGene>_<Rep>.txt     : overlap count per iteration (rows)
	                                        and category (columns)
	  <bg>-<test>_<numGene>_<Rep>_Sum.txt : per category: observed overlap, mean
	                                        sampled overlap, fold enrichment
	                                        (observed / mean, "NA" when the mean
	                                        is 0) and the fraction of iterations
	                                        whose overlap reached the observed one

	Dies on missing or invalid arguments, unreadable input, malformed category
	lines, or unwritable output. Warns and samples the whole background list
	when $numGene exceeds the number of background genes.

=cut

sub	main {
	my ($OrgGenefile, $TestGenefile, $numGene, $Rep) = @_;

	if (grep { !defined } ($OrgGenefile, $TestGenefile, $numGene, $Rep)) {
		die "Usage: perl OverlapEnrich.pl BackgroundGenes.txt OverlapGene.txt <No. sampled genes> <No. iterations>\n";
	}
	my $sampleSize = _positive_integer($numGene, "number of sampled genes");
	my $iterations = _positive_integer($Rep, "number of iterations");

	# Only the first prefix may carry a directory; a directory in the second
	# one would otherwise end up in the middle of the output path.
	require File::Basename;
	my $stem = _strip_extension($OrgGenefile) . "-"
		. _strip_extension(File::Basename::basename($TestGenefile))
		. "_" . $numGene . "_" . $Rep;
	my $outfile = $stem . ".txt";
	my $outfile2 = $stem . "_Sum.txt";

	my @background = _read_background_genes($OrgGenefile);
	my @categories = _read_categories($TestGenefile);

	die "No background genes found in '$OrgGenefile'" unless @background;
	if ($sampleSize > @background) {
		warn "Requested $sampleSize sampled genes but only " . scalar(@background)
			. " background genes are available; sampling all of them\n";
		$sampleSize = @background;
	}

	# Integer tallies; they are divided by the iteration count only once at the
	# end, which avoids accumulating floating-point error over many iterations.
	my @hits = (0) x @categories;		# iterations with overlap >= observed overlap
	my @total = (0) x @categories;		# overlap summed over all iterations

	open(my $iterFh, '>', $outfile) or die "Cannot write '$outfile': $!";
	my $header = "";
	$header .= $_->{name} . "\t" for @categories;
	print {$iterFh} $header . "\n";

	for my $iteration (1 .. $iterations) {
		# Draw without replacement; a hash makes each membership test O(1).
		my @picked = (List::Util::shuffle(@background))[0 .. $sampleSize - 1];
		my %sampled;
		@sampled{@picked} = ();

		my $row = "";
		for my $c (0 .. $#categories) {
			# Every listed test gene counts once if it is in the sample.
			my $overlap = grep { exists $sampled{$_} } @{ $categories[$c]{genes} };
			$row .= $overlap . "\t";
			$total[$c] += $overlap;
			$hits[$c]++ if $overlap >= $categories[$c]{thres};
		}
		print {$iterFh} $row . "\n";
	}
	close($iterFh) or die "Error while writing '$outfile': $!";

	open(my $sumFh, '>', $outfile2) or die "Cannot write '$outfile2': $!";
	print {$sumFh} "Category\tOriginalNo.Overlap\tMeanNo.Overlap\tFoldEnrich\tP-value\n";
	for my $c (0 .. $#categories) {
		my $thres = $categories[$c]{thres};
		my $mean = $total[$c] / $iterations;
		# No sampled overlap at all: the fold enrichment is undefined.
		my $fold = $mean > 0 ? $thres / $mean : "NA";
		my $pvalue = $hits[$c] / $iterations;
		print {$sumFh} join("\t", $categories[$c]{name}, $thres, $mean, $fold, $pvalue) . "\n";
	}
	close($sumFh) or die "Error while writing '$outfile2': $!";

	return;
}

# Returns $value as a number if it is written as a positive whole number
# (plain digits, optionally with an exponent such as 1e3); dies otherwise.
sub	_positive_integer {
	my ($value, $what) = @_;
	unless ($value =~ /\A\d+(?:[eE]\+?\d+)?\z/ && $value > 0) {
		die "Invalid $what '$value': expected a positive integer\n";
	}
	return $value + 0;
}

# Returns $path without its trailing ".ext" part; a name without an extension
# is returned unchanged.
sub	_strip_extension {
	my ($path) = @_;
	(my $stem = $path) =~ s{\.[^./\\]+\z}{};
	return $stem;
}

# Reads the background gene file and returns the list of gene names (first
# tab-separated column of every line, blank entries skipped).
sub	_read_background_genes {
	my ($file) = @_;
	open(my $in, '<', $file) or die "Cannot read background gene file '$file': $!";
	my @genes;
	while (my $line = <$in>) {
		$line =~ s/[\r\n]+\z//;		# accept both Unix and Windows line endings
		my ($gene) = split(/\t/, $line);
		next unless defined $gene && length $gene;
		push @genes, $gene;
	}
	close($in);
	return @genes;
}

# Reads the category file and returns one hash per category with the keys
# name, thres (observed overlap count) and genes (array reference).
# Blank lines are skipped; a missing or non-numeric overlap count is fatal.
sub	_read_categories {
	my ($file) = @_;
	require Scalar::Util;
	open(my $in, '<', $file) or die "Cannot read test gene file '$file': $!";
	my @categories;
	while (my $line = <$in>) {
		$line =~ s/[\r\n]+\z//;		# accept both Unix and Windows line endings
		my ($name, $thres, @genes) = split(/\t/, $line);
		next unless defined $name;
		unless (defined $thres && Scalar::Util::looks_like_number($thres)) {
			die "Line $. of '$file': column 2 must be the numeric overlap count for category '$name'\n";
		}
		push @categories, {
			name => $name,
			thres => $thres,
			genes => [ grep { length } @genes ],
		};
	}
	close($in);
	return @categories;
}
