The Perl Toolchain Summit needs more sponsors. If your company depends on Perl, please support this very important event.

next_data_hash

    About   : Reads next VCF line and splits it into a hash. This is the slowest way to obtain the data.
    Usage   : my $vcf = VCF->new();
              $vcf->parse_header();
              my $x = $vcf->next_data_hash();

              # Or having a VCF data line $line
              my $x = $vcf->next_data_hash($line);

    Args    : Optional line to parse.

parse_header

    About   : Reads (and stores) the VCF header.
    Usage   : my $vcf = VCF->new(); $vcf->parse_header();
    Args    : silent .. do not warn about duplicate header lines

_next_header_line

    About   : Stores the header lines and meta information, such as fields types, etc.
    Args    : silent .. do not warn about duplicate column names

get_header_line

    Usage   : $vcf->get_header_line(key=>'INFO', ID=>'AC')
              $vcf->get_header_line(key=>'FILTER', ID=>'q10')
              $vcf->get_header_line(key=>'reference')
              $vcf->get_header_line(key=>'contig',ID=>'20')
    Args    : Header line filter as in the example above
    Returns : List ref of header line hashes matching the filter

add_header_line

    Usage   : $vcf->add_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
              $vcf->add_header_line({key=>'reference',value=>'1000GenomesPilot-NCBI36'})
    Args    : Header line hash as in the example above
              Hash with additional parameters [optional]
                silent .. do not warn about existing header keys
                append .. append timestamp to the name of the new one
    Returns :

remove_header_line

    Usage   : $vcf->remove_header_line(key=>'INFO', ID=>'AC')
    Args    :
    Returns :

parse_header_line

    Usage   : $vcf->parse_header_line(q[##reference=1000GenomesPilot-NCBI36])
              $vcf->parse_header_line(q[##INFO=NS,1,Integer,"Number of Samples With Data"])
    Args    :
    Returns :

_read_column_names

    About   : Stores the column names as array $$self{columns} and hash $$self{has_column}{COL_NAME}=index.
              The indexes go from 1.
    Usage   : $vcf->_read_column_names();
    Args    : none

_fake_column_names

    About   : When no header is present, fake column names as the default mandatory ones + numbers
    Args    : The number of genotype columns; 0 if no genotypes but FORMAT present; <0 if FORMAT and genotypes not present

format_header

    About   : Returns the header.
    Usage   : print $vcf->format_header();
    Args    : The columns to include on output [optional]

format_line

    About   : Returns the header.
    Usage   : $x = $vcf->next_data_hash(); print $vcf->format_line($x);
              $x = $vcf->next_data_array(); print $vcf->format_line($x);
    Args 1  : The columns or hash in the format returned by next_data_hash or next_data_array.
         2  : The columns to include [optional]

recalc_ac_an

    About   : Control if the AC and AN values should be updated.
    Usage   : $vcf->recalc_ac_an(1); $x = $vcf->next_data_hash(); print $vcf->format_line($x);
    Args 1  : 0 .. never recalculate
              1 .. recalculate if present
              2 .. recalculate if present and add if missing

get_tag_index

    Usage   : my $idx = $vcf->get_tag_index('GT:PL:DP:SP:GQ','PL',':');
    Arg 1   : Field
        2   : The tag to find
        3   : Tag separator
    Returns : Index of the tag or -1 when not found

remove_field

    Usage   : my $field = $vcf->remove_field('GT:PL:DP:SP:GQ',1,':');    # returns 'GT:DP:SP:GQ'
    Arg 1   : Field
        2   : The index of the field to remove
        3   : Field separator
    Returns : Modified string

replace_field

    Usage   : my $col = $vcf->replace_field('GT:PL:DP:SP:GQ','XX',1,':');    # returns 'GT:XX:DP:SP:GQ'
    Arg 1   : Field
        2   : Replacement
        3   : 0-based index of the field to replace
        4   : Field separator
    Returns : Modified string

get_info_field

    Usage   : my $line  = $vcf->next_line;
              my @items = split(/\t/,$line);
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','AF');    # returns 0.5
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','DB');    # returns 1
              $af = $vcf->get_info_field('DP=14;AF=0.5;DB','XY');    # returns undef
    Arg 1   : The VCF line broken into an array
        2   : The tag to retrieve
    Returns : undef when tag is not present, the tag value if present, or 1 if flag is present

get_field

    Usage   : my $line  = $vcf->next_line;
              my @items = split(/\t/,$line);
              my $idx = $vcf->get_tag_index($$line[8],'PL',':');
              my $pl  = $vcf->get_field($$line[9],$idx) unless $idx==-1;
    Arg 1   : The VCF line broken into an array
        2   : The index of the field to retrieve
        3   : The delimiter [Default is ':']
    Returns : The tag value

get_sample_field

    Usage   : my $line  = $vcf->next_line;
              my @items = split(/\t/,$line);
              my $idx = $vcf->get_tag_index($$line[8],'PL',':');
              my $pls = $vcf->get_sample_field(\@items,$idx) unless $idx==-1;
    Arg 1   : The VCF line broken into an array
        2   : The index of the field to retrieve
    Returns : Array of values

split_mandatory

    About   : Faster alternative to regexs, extract the mandatory columns
    Usage   : my $line=$vcf->next_line; my @cols = $vcf->split_mandatory($line);
    Arg     :
    Returns : Pointer to the array of values

split_gt

    About   : Faster alternative to regexs
    Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0/1'); # returns (0,0,1)
    Arg     : Diploid genotype to split into alleles
    Returns : Array of values

split_by

    About   : Generalization of split_gt
    Usage   : my ($a1,$a2,$a3) = $vcf->split_gt('0/0|1',qw(| /)); # returns (0,0,1)
    Arg     : Diploid genotype to split into alleles
    Returns : Array of values

decode_genotype

    About   : Faster alternative to regexs
    Usage   : my $gt = $vcf->decode_genotype('G',['A','C'],'0/0'); # returns 'G/G'
    Arg   1 : Ref allele
          2 : Alt alleles
          3 : The genotype to decode
    Returns : Decoded GT string

validate_alt_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_alt_field($$x{ALT});
    Args    : The ALT arrayref
    Returns : Error message in case of an error.

event_type

    Usage   :   my $x = $vcf->next_data_hash();
                my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
                for my $allele (@$alleles)
                {
                    my ($type,$len,$ht) = $vcf->event_type($x,$allele);
                }
              or
                my ($type,$len,$ht) = $vcf->event_type($ref,$al);
    Args    : VCF data line parsed by next_data_hash or the reference allele
            : Allele
    Returns :   's' for SNP and number of SNPs in the record
                'i' for indel and a positive (resp. negative) number for the length of insertion (resp. deletion)
                'r' identical to the reference, length 0
                'o' for other (complex events) and the number of affected bases
                'b' breakend
                'u' unknown

has_AGtags

    About   : Checks the header for the presence of tags with variable number of fields (Number=A or Number=G, such as GL)
    Usage   : $vcf->parse_header(); my $agtags = $vcf->has_AGtags();
    Args    : None
    Returns : Hash {fmtA=>[tags],fmtG=>[tags],infoA=>[tags],infoG=>[tags]} or undef if none is present

parse_AGtags

    About   : Breaks tags with variable number of fields (that is where Number is set to 'A' or 'G', such as GL) into hashes
    Usage   : my $x = $vcf->next_data_hash(); my $values = $vcf->parse_AGtags($x);
    Args    : VCF data line parsed by next_data_hash
            : Mapping between ALT representations based on different REFs [optional]
            : New REF [optional]
    Returns : Hash {Allele=>Value}

format_AGtag

    About   : Format tag with variable number of fields (that is where Number is set to 'A' or 'G', such as GL)
    Usage   :
    Args    :
            :
            :
    Returns :

parse_alleles

    About   : Deprecated, use parse_haplotype instead.
    Usage   : my $x = $vcf->next_data_hash(); my ($al1,$sep,$al2) = $vcf->parse_alleles($x,'NA00001');
    Args    : VCF data line parsed by next_data_hash
            : The genotype column name
    Returns : Alleles and the separator. If only one allele is present, $sep and $al2 will be an empty string.

parse_haplotype

    About   : Similar to parse_alleles, supports also multiploid VCFs.
    Usage   : my $x = $vcf->next_data_hash(); my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001');
    Args    : VCF data line parsed by next_data_hash
            : The genotype column name
    Returns : Two array refs and two boolean flags: List of alleles, list of separators, and is_phased/empty flags. The values
                can be cashed and must be therefore considered read only!

format_haplotype

    Usage   : my ($alleles,$seps,$is_phased,$is_empty) = $vcf->parse_haplotype($x,'NA00001'); print $vcf->format_haplotype($alleles,$seps);

format_genotype_strings

    Usage   : my $x = { REF=>'A', gtypes=>{'NA00001'=>{'GT'=>'A/C'}}, FORMAT=>['GT'], CHROM=>1, POS=>1, FILTER=>['.'], QUAL=>-1 };
              $vcf->format_genotype_strings($x);
              print $vcf->format_line($x);
    Args 1  : VCF data line in the format as if parsed by next_data_hash with alleles written as letters.
         2  : Optionally, a subset of columns can be supplied. See also format_line.
    Returns : Modifies the ALT array and the genotypes so that ref alleles become 0 and non-ref alleles
                numbers starting from 1. If the key $$vcf{trim_redundant_ALTs} is set, ALT alleles not appearing
                in any of the sample column will be removed.

format_header_line

    Usage   : $vcf->format_header_line({key=>'INFO', ID=>'AC',Number=>-1,Type=>'Integer',Description=>'Allele count in genotypes'})
    Args    :
    Returns :

remove_columns

    Usage   : my $rec=$vcf->next_data_hash(); $vcf->remove_columns($rec,remove=>['NA001','NA0002']);
    Args    : VCF hash pointer
            : list of columns to remove or a lookup hash with column names to keep (remove=>[] or keep=>{})
    Returns :

add_columns

    Usage   : $vcf->add_columns('NA001','NA0002');
    Args    :
    Returns :

add_format_field

    Usage   : $x=$vcf->next_data_hash(); $vcf->add_format_field($x,'FOO'); $$x{gtypes}{NA0001}{FOO}='Bar'; print $vcf->format_line($x);
    Args    : The record obtained by next_data_hash
            : The field name
    Returns :

remove_format_field

    Usage   : $x=$vcf->next_data_hash(); $vcf->remove_format_field($x,'FOO'); print $vcf->format_line($x);
    Args    : The record obtained by next_data_hash
            : The field name
    Returns :

add_info_field

    Usage   : $x=$vcf->next_data_array(); $$x[7]=$vcf->add_info_field($$x[7],'FOO'=>'value','BAR'=>undef,'BAZ'=>''); print join("\t",@$x)."\n";
    Args    : The record obtained by next_data_array
            : The INFO field name and value pairs. If value is undef and the key is present in $$x[7],
                it will be removed. To add fields without a value, use empty string ''.
    Returns : The formatted INFO.

add_filter

    Usage   : $x=$vcf->next_data_array(); $$x[6]=$vcf->add_filter($$x[6],'SnpCluster'=>1,'q10'=>0); print join("\t",@$x)."\n";
    Args    : The record obtained by next_data_array or next_data_hash
            : The key-value pairs for filter to be added. If value is 1, the filter will be added. If 0, the filter will be removed.
    Returns : The formatted filter field.

validate_filter_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_filter_field($$x{FILTER});
    Args    : The FILTER arrayref
    Returns : Error message in case of an error.

validate_header

    About   : Version specific header validation code.
    Usage   : my $vcf = VCF->new(); $vcf->parse_header(); $vcf->validate_header();
    Args    :

validate_line

    About   : Version specific line validation code.
    Usage   : my $vcf = VCF->new(); $vcf->parse_header(); $x = $vcf->next_data_hash; $vcf->validate_line($x);
    Args    :

validate_info_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_info_field($$x{INFO},$$x{ALT});
    Args    : The INFO hashref
    Returns : Error message in case of an error.

validate_gtype_field

    Usage   : my $x = $vcf->next_data_hash(); $vcf->validate_gtype_field($$x{gtypes}{NA00001},$$x{ALT},$$x{FORMAT});
    Args    : The genotype data hashref
              The ALT arrayref
    Returns : Error message in case of an error.

run_validation

    About   : Validates the VCF file.
    Usage   : my $vcf = VCF->new(file=>'file.vcf'); $vcf->run_validation('example.vcf.gz');
    Args    : File name or file handle.

get_chromosomes

    About   : Get list of chromosomes from the VCF file. Must be bgzipped and tabix indexed.
    Usage   : my $vcf = VCF->new(); $vcf->get_chromosomes();
    Args    : none

get_samples

    About   : Get list of samples.
    Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my (@samples) = $vcf->get_samples();
    Args    : none

get_column

    About   : Convenient way to get data for a sample
    Usage   : my $rec = $vcf->next_data_array(); my $sample_col = $vcf->get_column($rec, 'NA0001');
    Args 1  : Array pointer returned by next_data_array
         2  : Column/Sample name

get_column_name

    About   : Mapping between zero-based VCF column and its name
    Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my $name = $vcf->get_column_name(1); # returns POS
    Args    : Index of the column (0-based)

get_column_index

    About   : Mapping between VCF column name and its zero-based index
    Usage   : my $vcf = VCF->new(); $vcf->parse_header(); my $name = $vcf->get_column_index('POS'); # returns 1
    Args    : Name of the column