#!/usr/bin/env perl

use strict;
use warnings;
use Carp;
use Vcf;

my $opts = parse_params();
do_consensus($opts);

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg )
    {
        croak @msg;
    }
    die
        "Usage: cat ref.fa | vcf-consensus [OPTIONS] in.vcf.gz > out.fa\n",
        "Options:\n",
        "   -h, -?, --help                   This help message.\n",
        "   -H, --haplotype <int>            Apply only variants for the given haplotype (1,2)\n",
        "   -i, --iupac-codes                Apply variants in the form of IUPAC ambiguity codes\n",
        "   -s, --sample <name>              If not given, all variants are applied\n",
        "Examples:\n",
        "   # Get the consensus for one region. The fasta header lines are then expected\n",
        "   # in the form \">chr:from-to\".\n",
        "   samtools faidx ref.fa 8:11870-11890 | vcf-consensus in.vcf.gz > out.fa\n",
        "\n";
}


sub parse_params
{
    my $opts = { };
    while (my $arg=shift(@ARGV))
    {
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( $arg eq '-s' || $arg eq '--sample' ) { $$opts{sample}=shift(@ARGV); next; }
        if ( $arg eq '-i' || $arg eq '--iupac-codes' ) { $$opts{iupac}=1; next; }
        if ( $arg eq '-H' || $arg eq '--haplotype' ) { $$opts{haplotype}=shift(@ARGV); next; }
        if ( -e $arg && !exists($$opts{vcf_file}) ) { $$opts{vcf_file}=$arg; next; }
        error("Unknown parameter \"$arg\". Run -h for help.\n");
    }

    if ( $$opts{iupac} )
    {
        delete($$opts{iupac});
        $$opts{iupac}{CT} = 'Y';
        $$opts{iupac}{TC} = 'Y';
        $$opts{iupac}{AG} = 'R';
        $$opts{iupac}{GA} = 'R';
        $$opts{iupac}{AT} = 'W';
        $$opts{iupac}{TA} = 'W';
        $$opts{iupac}{GC} = 'S';
        $$opts{iupac}{CG} = 'S';
        $$opts{iupac}{TG} = 'K';
        $$opts{iupac}{GT} = 'K';
        $$opts{iupac}{CA} = 'M';
        $$opts{iupac}{AC} = 'M';
        $$opts{iupac}{AA} = 'A';
        $$opts{iupac}{CC} = 'C';
        $$opts{iupac}{GG} = 'G';
        $$opts{iupac}{TT} = 'T';
    }

    if ( exists($$opts{haplotype}) && !exists($$opts{sample}) ) { error("Expected -s option with -H.\n"); }
    return $opts;
}

sub do_consensus
{
    my ($opts) = @_;
    
    my $vcf = Vcf->new(file=>$$opts{vcf_file});
    $vcf->parse_header;
    if ( exists($$opts{sample}) )
    {
        if ( !exists($$vcf{has_column}{$$opts{sample}}) ) { error("No such sample: $$opts{sample}"); }
        $$opts{vcf} = $vcf; 
        $$opts{sample_col} = $$vcf{has_column}{$$opts{sample}};
    }
    my $chrs = $vcf->get_chromosomes();
    my %chrs = map { $_=>0 } @$chrs;

    my ($chr,$vcf_pos,$warned,$vcf_line);
    while (my $line=<STDIN>)
    {
        if ( $line=~/^>([^:\s]+)/ ) 
        {
            $chr = $1;

            for my $line (@{$$vcf{buffer}}) { apply_variant($opts,$line); }
            flush_fa_buffer($opts,0);

            my $rest = $';
            if ( $rest=~/^:(\d+)-\d+$/ )
            {
                print STDERR "Looks as fasta file snippet, the sequence $chr starts at position $1\n";
                $$opts{fa_pos} = $1;
            }
            else
            {
                $$opts{fa_pos} = 1;
            }
            $$opts{fa_idx} = 0;
            $$opts{fa_frz} = 0;
            if ( exists($chrs{$chr}) ) { $chrs{$chr}=1; }
            my $region = $$opts{fa_pos} > 1 ? "$chr:$$opts{fa_pos}" : $chr;
            $vcf->open(region=>$region);
            print $line;
            next;
        }

        chomp($line);
        if ( !$$opts{case_known} )
        {
            if ( uc($line) eq $line ) { $$opts{case_known} = 'u'; }
            elsif ( lc($line) eq $line ) { $$opts{case_known} = 'l'; }
            else { $$opts{case_known} = 'u'; }
        }
        $$opts{fa_buf} .= $line;
        $$opts{fa_len} += length($line);

        while ( defined($vcf_line = $vcf->next_data_array()) )
        {
            # can the beginning of the buffer be printed?
            if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1] )
            {
                $vcf->_unread_line($vcf_line);
                flush_fa_buffer($opts,60);
                last;
            }
            # is the buffer long enough?
            if ( $$opts{fa_pos}+$$opts{fa_len}-$$opts{fa_idx}<=$$vcf_line[1]+length($$vcf_line[3]) )
            {
                $vcf->_unread_line($vcf_line);
                last;
            }
            apply_variant($opts,$vcf_line);
        }

        if ( !defined $vcf_line ) { flush_fa_buffer($opts,60); }
    }
    flush_fa_buffer($opts,0);

    for my $chr (keys %chrs)
    {
        if ( !$chrs{$chr} ) { warn("The sequence \"$chr\" not found in the fasta file.\n"); }
    }
}

sub flush_fa_buffer
{
    my ($opts,$len) = @_;
    while ( $$opts{fa_len} && $$opts{fa_len}>=60 )
    {
        print substr($$opts{fa_buf},0,60,''), "\n";
        $$opts{fa_len} -= 60;
        $$opts{fa_pos} += 60 - $$opts{fa_idx};
        $$opts{fa_idx}  = 0;
    }
    if ( $len or !$$opts{fa_len} ) { return; }
    print $$opts{fa_buf},"\n";
    $$opts{fa_pos} += $$opts{fa_len}-$$opts{fa_idx};
    $$opts{fa_len} = 0;
    $$opts{fa_buf} = '';
    $$opts{fa_idx} = 0;
}

sub apply_variant
{
    my ($opts,$vline) = @_;

    if ( $$vline[4] eq '.' ) { return; }

    my $hap = exists($$opts{haplotype}) ? $$opts{haplotype} : 0;
    my $alt;
    if ( !exists($$opts{sample_col}) )
    {
        # No sample requested, applying all sites, first ALT
        my $idx;
        $alt = ($idx=index($$vline[4],','))==-1 ? $$vline[4] : substr($$vline[4],0,$idx);
        if ( exists($$opts{iupac}) && length($$vline[3])==1 && length($alt)==1 )
        {
            $alt = uc($$vline[3].$alt);
            if ( !exists($$opts{iupac}{$alt}) ) { error("No IUPAC code for \"$alt\"\n"); }
            $alt = $$opts{iupac}{$alt};
        }
    }
    else
    {
        my $igt = $$opts{vcf}->get_tag_index($$vline[8],'GT',':');
        if ( $igt==-1 ) { return; }
        my $gt = $$opts{vcf}->get_field($$vline[$$opts{sample_col}-1],$igt);
        my @als = $$opts{vcf}->split_gt($gt);
        if ( $hap )
        {
            # Note: we are not checking the phase or phase blocks, assuming the VCF is perfect
            if ( $hap <= @als && $als[$hap-1] ne '0' ) 
            { 
                $alt = $$opts{vcf}->get_field($$vline[4],$als[$hap-1]-1,','); 
            }
        }
        else
        {
            if ( exists($$opts{iupac}) && length($$vline[3])==1 )   # only for SNPs and with -i
            {
                my @alts;
                for my $al (@als)
                {
                    if ( $al eq '.' ) { last; }
                    if ( $al eq '0' ) { push @alts,uc($$vline[3]); }
                    else
                    {
                        $alt = $$opts{vcf}->get_field($$vline[4],$al-1,',');
                        push @alts, uc($alt);
                        if ( length($alt)!=1 ) { last; }
                    }
                }
                if ( @alts==2 )
                {
                    if ( !exists($$opts{iupac}{$alts[0].$alts[1]}) ) { error("No IUPAC code for \"$alts[0]/$alts[1]\"\n"); }
                    $alt = $$opts{iupac}{$alts[0].$alts[1]};
                }
                elsif ( length($alts[0])==1 )
                {
                    if ( !exists($$opts{iupac}{$alts[0].$alts[0]}) ) { error("No IUPAC code for \"$alts[0]/$alts[0]\"\n"); }
                    $alt = $$opts{iupac}{$alts[0].$alts[0]};
                }
            }
            else
            {
                for my $al (@als)
                {
                    if ( $al eq '0' or $al eq '.' ) { next; }
                    $alt = $$opts{vcf}->get_field($$vline[4],$al-1,',');
                    last;
                }
            }
        }
        if ( !defined $alt or $alt eq $$vline[3] ) { return; }
    } 

    if ( $$vline[1] <= $$opts{fa_frz} )
    {
        print STDERR "Note: Conflicting variants at (or near) $$vline[0]:$$vline[1], cannot apply both.\n";
        return;
    }

    my $pos = $$vline[1] - $$opts{fa_pos} + $$opts{fa_idx};
    if ( $pos<0 or $pos>=$$opts{fa_len} ) { error("FIXME: $$vline[0]:$$vline[1] .. $$opts{fa_pos},$pos,$$opts{fa_len},$$opts{fa_frz}\n"); }

    # Sanity check
    my $ref_len = length($$vline[3]);
    if ( $$vline[3] ne uc(substr($$opts{fa_buf},$pos,$ref_len)) ) 
    { 
        error(sprintf "The fasta sequence does not match the REF at $$vline[0]:$$vline[1]. %s(%s) in .fa, %s in .vcf, frz=%d\n", 
            substr($$opts{fa_buf},$pos,$ref_len),
            substr($$opts{fa_buf},$pos+1,$ref_len+5),
            $$vline[3], $$opts{fa_frz}?$$opts{fa_frz}:0);
    }
    if ( $$opts{case_known} eq 'l' ) { $alt = lc($alt); }

    my $alt_len = length($alt);
    substr($$opts{fa_buf},$pos,$ref_len,$alt);
    $$opts{fa_len} += $alt_len - $ref_len;
    $$opts{fa_pos} += $ref_len;     # position with respect to the original reference sequence
    $$opts{fa_idx} += $alt_len;     # position in the modified sequence
    $$opts{fa_frz}  = $$vline[1] + $ref_len - 1;      # freeze changes until this position
}


