#!/usr/bin/env perl

use strict;
use warnings;
use autodie;
use Getopt::Long;
use List::Util qw/shuffle/;

use constant {
	SEED             => 666,
	COHORT           => 100,
	WINDOW           => 1000,
	MAX_NNN_PERC     => 0.25,
	FIXED_PERC       => 0.25,
	POLYMORPHIC_PERC => 0.50,
	SOMATIC_PERC     => 0.25,
	OUTPUT_DIR       => 'out'
};

if (@ARGV < 1) {
	usage();
	exit 0;
}

my $seed = SEED;
my $cohort = COHORT;
my $dir = OUTPUT_DIR;

GetOptions(
	"seed=i"       => \$seed,
	"cohort=i"     => \$cohort,
	"output-dir=s" => \$dir
) or die "Error in command line arguments\n";

my ($genome, $rtc_file) = @ARGV;

unless ($genome && $rtc_file) {
	usage();
	exit 1;
}

srand $seed;

mkdir $dir unless -d $dir;

my $chr_h = index_fasta($genome);
my $rtc_h = index_rtc($rtc_file);
my $pos_h = build_pos($rtc_h, $chr_h);
my $cohort_a = build_cohort($rtc_h, $cohort);

dump_rtc_pos($pos_h, "$dir/rtc_pos.tsv");
dump_rtc_ind($rtc_h, $pos_h, $cohort_a, $dir);

sub usage {
	print "Usage: $0 [--cohort=INT] [--seed=INT] [--output-dir=DIR] <GENOME> <RTC_FILE>\n";
}

sub dump_rtc_pos {
	my ($pos_h, $file) = @_;
	open my $fh, ">", $file;
	print $fh "#chromosome\tposition\tplus_strand\tparental\thomozygous\n";
	for my $id (sort keys %$pos_h) {
		my $d_h = $pos_h->{$id};
		print $fh "$d_h->{chr}\t$d_h->{pos}\t$d_h->{plus_strand}\t$id\t$d_h->{homozygous}\n";
	}
	close $fh;
}

sub dump_rtc_ind {
	my ($rtc_h, $pos_h, $cohort_a, $dir) = @_;

	for my $i (0..$#$cohort_a) {
		my $ind = $cohort_a->[$i];

		open my $fh, ">", "$dir/ind${i}.sandy";
		print $fh "#seqid\tposition\tid\treference\talteration\tgenotype\n";

		for my $parental (@$ind) {
			my $d_h = $pos_h->{$parental};

			my $seq = $d_h->{plus_strand}
				? $rtc_h->{$parental}
				: _reverse_complement($rtc_h->{$parental});

			my $zygosity = $d_h->{homozygous} ? "HO" : "HE";

			my $id = $parental;
			$id .= $d_h->{plus_strand} ? "_p" : "_m";
			$id .= "_${zygosity}";

			print $fh "$d_h->{chr}\t$d_h->{pos}\t$id\t-\t$seq\t$zygosity\n";
		}

		close $fh;
	}
}

sub _reverse_complement {
	my $seq = shift;
	my $rev_seq = reverse $seq;
	$rev_seq =~ tr/ATCGatcg/TAGCtagc/;
	return $rev_seq;
}

sub index_fasta {
	my $genome = shift;
	open my $fh, "<" => $genome;
	my %chr;
	my $id;
	while (<$fh>) {
		chomp;
		next if /^;/;
		if (/^>/) {
			my @f = split /\|/;
			$id = $f[0];
			$id =~ s/^>//;
			$id =~ s/^\s+|\s+$//g;
		} else {
			die "Error reading fasta file '$genome': Not defined id"
				unless defined $id;
			$chr{$id} .= $_;
		}
	}
	close $fh;
	return \%chr;
}

sub index_rtc {
	my $rtc_file = shift;
	open my $fh, "<" => $rtc_file;
	my %rtc;
	while (<$fh>) {
		chomp;
		my @f = split;
		$rtc{$f[0]} = $f[-1];
	}
	close $fh;
	return \%rtc;
}

sub build_pos {
	my ($rtc_h, $chr_h) = @_;

	my @chrs = keys %$chr_h;
	my @chrs_len = map {length $chr_h->{$_}} @chrs;
	my $weights_a = _build_weights(\@chrs_len);

	my %pos;
	for my $id (keys %$rtc_h) {
		my $chr_i = _bsearch(
			int(rand($weights_a->[-1]{up} + 1)),
			$weights_a,
			0,
			$#$weights_a
		);
		die "No index for chr" unless defined $chr_i;
		my $chr = $chrs[$chr_i];

		my $seq_s = \$chr_h->{$chr};
		my $pos;
		do {{
			$pos = int(rand(length $$seq_s));
		}} while(_is_pos_inside_NNN($seq_s, $pos));

		my $plus_strand = int(rand(2));
		my $homozygous = int(rand(4)) == 3 ? 1 : 0;

		$pos{$id} = {
			chr         => $chr,
			pos         => $pos,
			plus_strand => $plus_strand,
			homozygous  => $homozygous
		};
	}

	return \%pos;
}

sub _build_weights {
	my $w_a = shift;
	my @offset;
	my $left = 0;
	for (my $i = 0; $i < @$w_a; $i++) {
		my %w = (
			down => $left,
			up   => $left + $w_a->[$i] - 1
		);
		$left += $w_a->[$i];
		push @offset => \%w;
	}
	return \@offset;
}

sub _bsearch {
	my ($key1, $base, $start, $end) = @_;
	if ($start > $end) {
		# Not found!
		return;
	}
	my $index = int(($start + $end) / 2);
	my $key2 = $base->[$index];
	# $key1 <=> $key2
	my $rc = _cmp($key1, $key2);
	if ($rc > 0) {
		return _bsearch($key1, $base, $index + 1, $end);
	} elsif ($rc < 0) {
		return _bsearch($key1, $base, $start, $index - 1);
	} else {
		return $index;
	}
}

sub _cmp {
	# State the function to compare at _bsearch
	my ($r, $w) = @_;
	if ($r  >= $w->{down} && $r  <= $w->{up}) {
		return 0;
	}
	elsif ($r > $w->{down}) {
		return 1;
	} else {
		return -1;
	}
}

sub _is_pos_inside_NNN {
	my ($seq_s, $pos) = @_;
	my $seq_len = length $$seq_s;
	my $start_pos = $pos - int(WINDOW / 2);
	if ($start_pos < 0) {
		$start_pos = 0;
	} elsif (($start_pos + WINDOW) > $seq_len) {
		$start_pos = $seq_len - WINDOW;
	}
	my $win = substr $$seq_s, $start_pos, WINDOW;
	my $NNN_acm = $win =~ tr/Nn/Nn/;
	return $NNN_acm > int(WINDOW * MAX_NNN_PERC)
		? 1
		: 0;
}

sub build_cohort {
	my ($rtc_h, $cohort) = @_;

	my @parental = shuffle sort keys %$rtc_h;
	my $parental_size = scalar @parental;

	my @fixed = splice @parental, 0,
		int(FIXED_PERC * $parental_size);
	my @polymorphic = splice @parental, 0,
		int(POLYMORPHIC_PERC * $parental_size);
	my @somatic = splice @parental, 0,
		int(SOMATIC_PERC * $parental_size);

	# FIXED
	my @cohort;
	push @cohort => [@fixed] for 1..$cohort;

	# POLYMORPHIC
	for my $ind (@cohort) {
		for my $gene (@polymorphic) {
			if (int(rand(4)) == 3) {
				push @$ind => $gene;
			}
		}
	}

	# SOMATIC
	my @putative_somatic = shuffle 0..$cohort - 1;
	my @somatic_inds = splice @putative_somatic, 0, scalar(@somatic);
	for my $i (0..$#somatic_inds) {
		my $ind = $cohort[$somatic_inds[$i]];
		push @$ind => $somatic[$i];
	}

	return \@cohort;
}
