#!	/usr/bin/perl	-w

=head1	header

	###########################################################
	#  
	#  This script makes annotation of SNP info in reference haplotype data
	#  Also makes flags of SNP include/exclude based on defined QC thresholds (MAF, HWE-P)
	#  
	#  perl make1000GHapAnnot.pl Chr22_Ref_Ex
	#  
	#  Input : Chr22_Ref_Ex.hap, Chr22_Ref_Ex.map
	#  Output : Chr22_Ref_Ex.annot.txt, Chr22_Ref_Ex_flag.txt
	#  
	#  1KG haplotype data is available from http://www.sph.umich.edu/csg/abecasis/MACH/download/
	#  
	#  Any questions to Yukinori Okada (http://plaza.umin.ac.jp/~yokada/datasource/software.htm   yokada@broadinstitute.org)

=cut



&main(@ARGV);
exit;

=head1	main

=cut

sub	main(@){
	my ($input_head)=@_;
	my $hap = $input_head.".hap";
	my $map = $input_head.".map";
	my $Annot = $input_head.".annot.txt";
	my $flag = $input_head."_flag.txt";
	my @Info;
	my $counter = 0;
	my $out = "";
	my $outflag = "";
	my @ref = ("A", "T", "G", "C");
	my @refG = ("AA", "AC", "AG", "AT", "CC", "CG", "CT", "GG", "GT", "TT");
	my @Geno;

	open (INPUTmap, "$map");
	while(<INPUTmap>){
		chomp;
		my @inline = split(/\s/);
		$Info[$counter][0] = $inline[0];
		$Info[$counter][1] = $inline[1];
		$Info[$counter][2] = $inline[2];
		
		for (my $i=3;$i<=14;$i++) {
			$Info[$counter][$i] = 0;
		}
		$Info[$counter][8] = "-";
		
		for (my $j=0;$j<@refG;$j++) {
			$Geno[$counter][$j] = 0;
		}
		$counter++;
	}
	close INPUTmap;
	
		my $dip = 0;
		$numHap = 0;
		open (INPUT, "$hap");
		while(<INPUT>){
			chomp;
			my @inline = split(/ /);
			my @inline2 = split(//, $inline[2]);
			
			for(my $i=0;$i<@inline2;$i++){
				if ($inline2[$i] eq "A") {
					$Info[$i][3]++;
				} elsif ($inline2[$i] eq "T") {
					$Info[$i][4]++;
				} elsif ($inline2[$i] eq "G") {
					$Info[$i][5]++;
				} elsif ($inline2[$i] eq "C") {
					$Info[$i][6]++;
				}
				
				if ($dip == 0) {
					$geno[$i] = $inline2[$i];
				} else {
					my @tmp = sort($inline2[$i], $geno[$i]);
					$geno[$i] = $tmp[0]."".$tmp[1];
#					print $geno[$i]."\n";
					for (my $j=0;$j<@refG;$j++) {
						if ($geno[$i] eq $refG[$j]) {
							$Geno[$i][$j]++;
						}
					}
				}
			}
			
			if ($dip==0) {
				$dip = 1;
			} else {
				$dip = 0;
			}
			$numHap++;
		
		}
		close INPUT;
		
		open (OUT, "> $Annot");
		open (OUTflag, "> $flag");
		$out = "SNPID\tchr\tposi\tA\tT\tG\tC\tA1\tA2\tAF1\tAF2\tA11\tA12\tA22\tHWE-P\n";
		print OUT $out;
		
		for (my $i=0;$i<$counter;$i++) {
			my @Allele = ($Info[$i][3], $Info[$i][4], $Info[$i][5], $Info[$i][6]);
			my @AlleleSort = sort($Info[$i][3], $Info[$i][4], $Info[$i][5], $Info[$i][6]);
#			if ($AlleleSort[0]==0 && $AlleleSort[1]==0 && $AlleleSort[2]!=0 && $AlleleSort[3]!=0) {
			if ($AlleleSort[0]==0 && $AlleleSort[1]==0 && $AlleleSort[3]!=0) {
				my $counter2 = 0;
				for (my $j=0;$j<4;$j++) {
					if ($Allele[$j]>0) {
						$Info[$i][7+$counter2] = $ref[$j];
						$Info[$i][9+$counter2] = $Allele[$j]/$numHap;
						$counter2++;
					}
				}
				
				my @tmp11 = sort($Info[$i][7], $Info[$i][7]);
				my @tmp12 = sort($Info[$i][7], $Info[$i][8]);
				my @tmp22 = sort($Info[$i][8], $Info[$i][8]);
				my $tmpgeno11 = $tmp11[0]."".$tmp11[1];
				my $tmpgeno12 = $tmp12[0]."".$tmp12[1];
				my $tmpgeno22 = $tmp22[0]."".$tmp22[1];
				
				for (my $j=0;$j<@refG;$j++) {
					if ($tmpgeno11 eq $refG[$j]) {
						$Info[$i][11] = $Geno[$i][$j];
					}
					if ($tmpgeno12 eq $refG[$j]) {
						$Info[$i][12] = $Geno[$i][$j];
					}
					if ($tmpgeno22 eq $refG[$j]) {
						$Info[$i][13] = $Geno[$i][$j];
					}
				}
				$Info[$i][14] = &snphwe($Info[$i][11], $Info[$i][12], $Info[$i][13]);
				
				if ($Info[$i][9]>0.001 && $Info[$i][10]>0.001 && $Info[$i][14]>=0.0000001) { # MAF>0.001 & HWE-P>0.0000001
					$outflag = "1\n";
				} else {
					$outflag = "0\n";
				}
			} else {
				$Info[$i][7] = "-";
				$Info[$i][8] = "-";
				$Info[$i][9] = 0;
				$Info[$i][10] = 0;
				$Info[$i][11] = 0;
				$Info[$i][12] = 0;
				$Info[$i][13] = 0;
				$Info[$i][14] = "-";
			
				$outflag = "0\n";
			}
			$out = $Info[$i][1]."\t".$Info[$i][0]."\t".$Info[$i][2]."\t".$Info[$i][3]."\t".$Info[$i][4]."\t".$Info[$i][5]."\t".$Info[$i][6]."\t".$Info[$i][7]."\t".$Info[$i][8]."\t".$Info[$i][9]."\t".$Info[$i][10]."\t".$Info[$i][11]."\t".$Info[$i][12]."\t".$Info[$i][13]."\t".$Info[$i][14]."\n";
			
			print OUT $out;
			print OUTflag $outflag;
		}
		close OUT;
		close OUTflag;


}

sub snphwe {
    my ($obs_hom1, $obs_hets, $obs_hom2) = @_;
#    my $obs_hets = shift;
#    my $obs_hom1 = shift;
#    my $obs_hom2 = shift;

    if($obs_hom1 < 0 || $obs_hom2 < 0 || $obs_hets <0) {
	return(-1);
    }

    # rare homozygotes
    my $obs_homr;

    # common homozygotes
    my $obs_homc;
    if($obs_hom1 < $obs_hom2) {
	$obs_homr = $obs_hom1;
	$obs_homc = $obs_hom2;
    } else {
	$obs_homr = $obs_hom2;
	$obs_homc = $obs_hom1;
    }

    # number of rare allele copies
    my $rare_copies = 2 * $obs_homr + $obs_hets;

    # total number of genotypes
    my $genotypes = $obs_homr + $obs_homc + $obs_hets;

    if($genotypes <= 0) {
	return(-1);
    }
    
    # Initialize probability array
    my @het_probs;
    for(my $i=0; $i<=$rare_copies; $i++) {
	$het_probs[$i] = 0.0;
    }

    # start at midpoint
    my $mid = int($rare_copies * (2 * $genotypes - $rare_copies) / (2 * $genotypes));

    # check to ensure that midpoint and rare alleles have same parity
    if(($rare_copies & 1) ^ ($mid & 1)) {
	$mid++;
    }
    
    my $curr_hets = $mid;
    my $curr_homr = ($rare_copies - $mid) / 2;
    my $curr_homc = $genotypes - $curr_hets - $curr_homr;

    $het_probs[$mid] = 1.0;
    my $sum = $het_probs[$mid];
    for($curr_hets = $mid; $curr_hets > 1; $curr_hets -= 2) {
	$het_probs[$curr_hets - 2] = $het_probs[$curr_hets] * $curr_hets * ($curr_hets - 1.0) / (4.0 * ($curr_homr + 1.0) * ($curr_homc + 1.0));
	$sum += $het_probs[$curr_hets - 2];

	# 2 fewer heterozygotes for next iteration -> add one rare, one common homozygote
	$curr_homr++;
	$curr_homc++;
    }

    $curr_hets = $mid;
    $curr_homr = ($rare_copies - $mid) / 2;
    $curr_homc = $genotypes - $curr_hets - $curr_homr;
    for($curr_hets = $mid; $curr_hets <= $rare_copies - 2; $curr_hets += 2) {
	$het_probs[$curr_hets + 2] = $het_probs[$curr_hets] * 4.0 * $curr_homr * $curr_homc / (($curr_hets + 2.0) * ($curr_hets + 1.0));
	$sum += $het_probs[$curr_hets + 2];
	
	# add 2 heterozygotes for next iteration -> subtract one rare, one common homozygote
	$curr_homr--;
	$curr_homc--;
    }

    for(my $i=0; $i<=$rare_copies; $i++) {
	$het_probs[$i] /= $sum;
    }

    # alternate p-value calculation for p_hi/p_lo
#    my $p_hi = $het_probs[$obs_hets];
#    for(my $i=$obs_hets+1; $i<=$rare_copies; $i++) {
#	$p_hi += $het_probs[$i];
#    }
#    
#    my $p_lo = $het_probs[$obs_hets];
#    for(my $i=$obs_hets-1; $i>=0; $i--) {
#	$p_lo += $het_probs[$i];
#    }
#
#    my $p_hi_lo;
#    if($p_hi < $p_lo) {
#	$p_hi_lo = 2 * $p_hi;
#    } else {
#	$p_hi_lo = 2 * $p_lo;
#    }

    # Initialise P-value 
    my $p_hwe = 0.0;

    # P-value calculation for p_hwe
    for(my $i = 0; $i <= $rare_copies; $i++) {
	if($het_probs[$i] > $het_probs[$obs_hets]) {
	    next;
	}
	$p_hwe += $het_probs[$i];
    }
    
    if($p_hwe > 1) {
	$p_hwe = 1.0;
    }

    return($p_hwe);
}
