#!/pro/bin/perl
# csv2xlsx: Convert csv to xlsx
# (m)'23 Copyright H.M.Brand 2007-2024
use 5.014000;
our $VERSION = "1.23 - 2023-11-14";
sub usage {
my $err = shift and select STDERR;
print <<"EOU";
usage: csv2xlsx [options] [-o <xlsx>] [file.csv]
csv2xlsx --help | --man | --info
-s <sep> use <sep> as separator char, auto-detect, default = ','
The string "tab" is allowed.
-e <esc> use <esc> as escape char, auto-detect, default = '"'
The string "undef" is allowed.
-q <quot> use <quot> as quotation char, default = '"'
The string "undef" will disable quotation.
-w <width> use <width> as default minimum column width default = 4
-o <xlsx> write output to file named <xlsx>, defaults
to input file name with .csv replaced with .xlsx
if from standard input, defaults to csv2xlsx.xlsx
-F allow formula's. Otherwise fields starting with
an equal sign are forced to string
--Fa=aaa Define formula action: none/die/croak/diag/empty/undef
--Ft Formula's will be stored as text (formula actions: none)
--Fd Formula's will cause a die
--Fc Formula's will cause a croak
--FD Formula's will cause a warning (this is the default)
--Fe Formula's will be replaced by the empty string
--Fu Formula's will be replaced with an undefined cell
-f force usage of <xlsx> if already exists (unlink before use)
-d <dtfmt> use <dtfmt> as date formats. Default = 'dd-mm-yyyy'
-C <C:fmt> use <fmt> as currency formats for currency <C>, no default
--font=F set default font (default Calibri)
--fs=N set font size (default 10)
'M' = 'mono', 'D' = 'DejaVu Sans', 'DM' = 'DejaVu Sans Mono'
-D cols only convert dates in columns <cols>.
Default is everywhere. -D0 is disable
-L N limit export to N rows
-u CSV is UTF8
--de Some CSV fields might be double-encoded. Try to fix that.
-m merge multiple CSV's into a single xlsx (separate sheets)
-o is required, all arguments should be existing files
--skip-empty Skip empty files on merge
-S <cp> Split CSV on COLUMNxPAT into separate sheets. See --man
or --info for options/features and examples. May repeat.
--sl=C Use column C as sheet label when splitting with -S
-v [<lvl>] verbosity (default = 1)
EOU
exit $err;
} # usage
use List::Util qw( first );
use Date::Calc qw( Delta_Days Days_in_Month );
use Encode qw( from_to );
use Getopt::Long qw(:config bundling passthrough noignorecase );
my $quo = '"';
my $esc = '"';
my $wdt = 4; # Default minimal column width
my $fac = "diag"; # Formula action (default is warn only)
my $dtf = "dd-mm-yyyy"; # Date format
my $crf = ""; # Currency format, e.g.: $:### ### ##0.00
my $opt_v = 1;
my $dtc;
GetOptions (
"V|version" => sub { say $0 =~ s{.*/}{}r, " [$VERSION]"; exit 0; },
"help|?" => sub { usage (0); },
"man" => sub { pod_nroff (); },
"info" => sub { pod_text (); },
"c|s|sep=s" => \my $sep, # Set after reading first line in attempt to auto-detect
"q|quo=s" => \$quo,
"e|esc=s" => \$esc,
"w|width=i" => \$wdt,
"o|x|out=s" => \my $xls,
"d|date-fmt=s" => \$dtf,
"D|date-col=s" => \$dtc,
"C|curr-fmt=s" => \$crf,
"font=s" => \my $font,
"fs|font-size=i" => \my $font_size,
"f|force!" => \my $frc,
"F|formulas!" => \my $frm,
"Fa=s" => \$fac,
"Ft" => sub { $fac = "none"; },
"Fd" => sub { $fac = "die"; },
"Fc" => sub { $fac = "croak"; },
"FD" => sub { $fac = "diag"; },
"Fe" => sub { $fac = "empty"; },
"Fu" => sub { $fac = "undef"; },
"u|utf-8|utf8!" => \my $utf,
"de|fix-utf8!" => \my $dutf, # double encoded? \x{c3}\x{ab} => \x{100}
"m|merge!" => \my $mrg,
"skip-empty!" => \my $skip_empty,
"S|split=s" => \my @split,
"sl|split-label=s"=> \my $split_sl,
"L|row-limit=i" => \my $row_limit,
"v|verbose:2" => \$opt_v,
) or usage (1);
sub pod_text {
my $m = $ENV{NO_COLOR} ? "Pod::Text" : "Pod::Text::Color";
my $p = $m->new ();
open my $fh, ">", \my $out or die "Cannot generate manual: $!\n";
$p->parse_from_file ($0, $fh);
close $fh;
print $out;
exit 0;
} # pod_text
sub pod_nroff {
first { -x "$_/nroff" } grep { -d } split m/:+/ => $ENV{PATH} or pod_text ();
require Pod::Man;
my $p = Pod::Man->new ();
open my $fh, "|-", "nroff", "-man" or die "Cannot generate manual: $!\n";
$p->parse_from_file ($0, $fh);
close $fh;
exit 0;
} # pod_nroff
if ($mrg) {
my @csv;
for (@ARGV) {
if (m/\.xlsx?$/i) {
$xls and usage (1);
$xls = $_;
next;
}
if (m/\.(?:csv|png|jpe?g|bmp|gif|tiff|xpm)$/i && -s) {
push @csv => $_;
next;
}
unless ($skip_empty) {
warn "Argument $_ is not an existing (CSV) file\n";
usage (1);
}
}
$xls && @csv or usage (1);
@ARGV = @csv;
}
sub col2col {
my $l = shift;
$l =~ m/^[0-9]/ and return $l;
my $c = 0;
while ($l =~ s/^([A-Za-z])//) {
$c = 26 * $c + 1 + ord (uc $1) - ord ("A");
}
$c;
} # col2col
foreach my $split (@split) {
my ($col, $operator, $pat) = ($split =~ m{^
([0-9]+|[A-Z]+|[a-z]+) # Column: A, AB, 1, 14
([=/uU<>]) # Operator
(.*) # Pattern/string
\z}x) or usage (1);
my $case = $col =~ m/^[a-z]/ ? 1 : 0;
$split = {
col => col2col ($col),
op => $operator,
str => $pat,
ic => $case,
lbl => undef,
};
if ($split_sl) {
($col) = ($split_sl =~ m/^([0-9]+|[A-Z]+)$/) or usage (1);
$split->{lbl} = col2col ($col);
}
}
my $base = @ARGV && -f $ARGV[0] ? $ARGV[0] : "csv2xlsx";
$xls ||= $base =~ s/(?:\.csv)?$/.xlsx/ir;
-s $xls && $frc and unlink $xls;
if (-s $xls) {
print STDERR "File '$xls' already exists. Overwrite? [y/N] > N\b";
scalar <STDIN> =~ m/^[yj](?:es|a)?$/i or exit;
}
# Test if the file can be created
# The error from Excel::Writer::XLSX is not very informative if it fails
# e.g. when the folder cannot be used for whatever reason
{ open my $fh, ">", $xls or die "$xls: $!\n";
close $fh;
unlink $xls;
}
if ($dutf) {
eval { require Encode::DoubleEncodedUTF8; };
if ($@) {
$dutf = 0;
warn "Cannot load Encode::DoubleEncodedUTF8; --de ignored\n";
}
}
my $wbk = Excel::Writer::XLSX->new ($xls);
$dtf =~ s/j/y/g;
my %fmt = (
dflt => $wbk->add_format (),
date => $wbk->add_format (align => "center", num_format => $dtf),
rest => $wbk->add_format (align => "left"),
wrap => $wbk->add_format (text_wrap => 1),
);
$crf =~ s/^([^:]+):(.*)/$1/ and $fmt{currency} = $wbk->add_format (
num_format => "$1 $2",
align => "right",
);
if ($font) {
$font eq "M" and $font = "mono";
$font eq "D" and $font = "DejaVu Sans";
$font eq "DM" and $font = "DejaVu Sans Mono";
$_->set_font ($font) for values %fmt;
}
if ($font_size) {
$_->set_size ($font_size) for values %fmt;
}
my @args = @ARGV ? @ARGV : ("");
foreach my $csvf (@args) {
my $sheetname = $csvf =~ s{\.\w+$}{}ir =~ s{.*/}{}r || "Sheet 1";
($_ = length $sheetname) > 31 and substr $sheetname, 31, $_ - 31, "";
my ($wks, $w, $h);
$opt_v > 7 and warn "Parsing $csvf into $xls.$sheetname ...\n";
if ($csvf =~ m/\.(png|jpe?g|bmp|gif|tiff|xpm)$/i) {
$wks = $wbk->add_worksheet ($sheetname);
$wks->insert_image (1, 1, $csvf);
next;
}
my $row;
my $firstline;
my $fh;
if (-f $csvf) {
$opt_v and say "Reading $csvf";
open $fh, "<", $csvf or die "$csvf: $!\n";
}
else {
$opt_v and say "Reading STDIN";
$fh = *ARGV;
}
my $Sep = $sep;
unless ($Sep) { # No sep char passed, try to auto-detect;
while (<$fh>) {
m/\S/ or next; # Skip empty leading blank lines
$Sep = # start auto-detect with quoted strings
m/["\d];["\d;]/ ? ";" :
m/["\d],["\d,]/ ? "," :
m/["\d]\t["\d,]/ ? "\t" :
# If neither, then for unquoted strings
m/\w;[\w;]/ ? ";" :
m/\w,[\w,]/ ? "," :
m/\w\t[\w,]/ ? "\t" :
# And pipes (lowest prio)
m/["\d]\|["\d,]/ ? "|" :
m/\w\|[\w,]/ ? "|" :
m/,/ && !m/[;\t]/ ? "," :
m/;/ && !m/[,\t]/ ? ";" :
m/\t/ && !m/[,;]/ ? "\t" :
"," ;
$firstline = $_;
last;
}
$firstline or die "The sourcefile does not contain any usable data\n";
}
my $csv = Text::CSV_XS-> new ({
sep_char => $Sep eq "tab" ? "\t" : $Sep,
quote_char => $quo eq "undef" ? undef : $quo,
escape_char => $esc eq "undef" ? undef : $esc,
binary => 1,
keep_meta_info => 1,
auto_diag => 1,
formula => $fac,
});
if ($firstline) {
$csv->parse ($firstline) or die $csv->error_diag ();
$row = [ $csv->fields ];
}
if ($opt_v > 3) {
foreach my $k (qw( sep_char quote_char escape_char )) {
my $c = $csv->$k () || "undef";
$c =~ s/\t/\\t/g;
$c =~ s/\r/\\r/g;
$c =~ s/\n/\\n/g;
$c =~ s/\0/\\0/g;
$c =~ s/([\x00-\x1f\x7f-\xff])/sprintf"\\x{%02x}",ord$1/ge;
printf STDERR "%-11s = %s\n", $k, $c;
}
}
if (length $dtc) {
if ($dtc eq "0") {
$dtc = { -2 => 0 };
}
else {
my $rows = $dtc;
$rows =~ s/-$/-999/; # 3,6-
$rows =~ s/-/../g;
eval "\$dtc = { map { \$_ => 1 } $rows }";
}
}
my @w; # data height, -width, and default column widths
while ($row && @$row or $row = $csv->getline ($fh)) {
$row_limit and $csv->record_number > $row_limit and last;
my @row = @$row;
$opt_v > 8 and warn "@row\n";
if (@split) {
my $ns = 0;
foreach my $split (@split) {
my ($i, $op, $str, $case) = @{$split}{qw( col op str ic )};
my $v = $row[$i - 1]; # Column-index is one-based
if (defined $v) {
$op eq "U" and $ns += 1;
if ($case) {
$op eq "=" and $ns += ( lc $v eq lc $v );
$op eq "/" and $ns += ( $v =~ m/$str/i );
}
else {
$op eq "=" and $ns += ( $v eq $str );
$op eq "/" and $ns += ( $v =~ m/$str/ );
if ($v =~ m/^[0-9]+$/) {
$op eq "<" and $ns += ( $v < $str );
$op eq ">" and $ns += ( $v > $str );
}
else {
$op eq "<" and $ns += ( $v lt $str );
$op eq ">" and $ns += ( $v gt $str );
}
}
}
else {
$op eq "u" and $ns += 1;
}
}
$opt_v > 8 and warn join " " => "Record", $csv->record_number,
"matched", $ns, "out of", scalar @split, "criteria\n";
if ($ns == @split) { # All criteria PASS
if (@w) {
$wks->set_column ($_, $_, $w[$_]) for 0 .. $#w;
$wks = undef;
}
$split[0]{lbl} and $sheetname = $row[$split[0]{lbl} - 1];
}
}
unless ($wks) {
$wks = $wbk->add_worksheet ($sheetname);
$utf && !$wks->can ("write_unicode") and $utf = 0;
($h, $w, @w) = (0, 1);
}
@row > $w and push @w => ($wdt) x (($w = @row) - @w);
foreach my $c (0 .. $#row) {
my $val = $row[$c] // "";
my $l = length $val;
$l > ($w[$c] // -1) and $w[$c] = $l;
$dutf and $csv->is_binary ($c) and utf8::valid ($val) and
$val = Encode::decode ("utf-8-de", $val);
if ($utf and $csv->is_binary ($c)) {
from_to ($val, "utf-8", "ucs2");
$wks->write_unicode ($h, $c, $val);
next;
}
if ($csv->is_quoted ($c)) {
$val =~ s/\r\n/\n/g;
if ($utf) {
from_to ($val, "utf-8", "ucs2");
$val =~ m/\n/
? $wks->write_unicode ($h, $c, $val, $fmt{wrap})
: $wks->write_unicode ($h, $c, $val, $fmt{dflt});
}
else {
$val =~ m/\n/
? $wks->write_string ($h, $c, $val, $fmt{wrap})
: $wks->write_string ($h, $c, $val, $fmt{dflt});
}
next;
}
if (!$dtc or $dtc->{$c + 1}) {
my @d = (0, 0, 0); # Y, M, D
$val =~ m/^(\d{4})(\d{2})(\d{2})$/ and @d = ($1, $2, $3);
$val =~ m/^(\d{2})-(\d{2})-(\d{4})$/ and @d = ($3, $2, $1);
if ( $d[2] >= 1 && $d[2] <= 31 &&
$d[1] >= 1 && $d[1] <= 12 &&
$d[0] >= 1900 && $d[0] <= 2199) {
my $dm = Days_in_Month (@d[0,1]);
$d[2] < 1 and $d[2] = 1;
$d[2] > $dm and $d[2] = $dm;
my $dt = 2 + Delta_Days (1900, 1, 1, @d);
$wks->write ($h, $c, $dt, $fmt{date});
next;
}
}
if ($crf and $val =~ m/^\s*\Q$crf\E\s*([0-9.]+)$/) {
$wks->write ($h, $c, $1 + 0, $fmt{currency});
next;
}
if (!$frm && $val =~ m/^=/) {
$wks->write_string ($h, $c, $val);
}
else {
$wks->write ($h, $c, $val, $fmt{dflt});
}
}
++$h % 100 or $opt_v && printf STDERR "%6d x %6d\r", $w, $h;
} continue { $row = undef }
close $fh;
$opt_v && printf STDERR "%6d x %6d\n", $w, $h;
$wks->set_column ($_, $_, $w[$_]) for 0 .. $#w;
}
$opt_v and say "Writing $xls";
$wbk->close ();
__END__
=head1 NAME
csv2xlsx - Convert CSV to Excel 2007+
=head1 SYNOPSIS
csv2xlsx [options] [file.csv]
csv2xlsx test.csv
csv2xlsx -f -o merged.xlsx -m foobar*.csv
=head1 DESCRIPTION
This tool converts CSV data to Excel-2007+. It can convert the CSV into a
single sheet, merge CSV files into multiple sheets in a workbook or split
a single CSV into multiple sheets.
The tool supports encoding, formula handle, date conversion and some more.
=head1 OPTIONS
=over 2
=item -s S
=item --sep=S
use C<S> as separator character, auto-detect, default = C<,>.
The literal string C<tab> is allowed.
=item -q Q
=item --quo=Q
use C<Q> as quotation character, auto-detect, default = C<">.
The literal string C<undef> is allowed to disable quotation.
=item -e E
=item --esc=E
use C<E> as escape character, auto-detect, default = C<">.
The literal string C<undef> is allowed to disable escapes.
=item -w W
=item --width=W
Set column width. Default is to auto-size per column per sheet with a
minimum width of C<4>.
=item -o FILE.xlsx
=item -x FILE.xlsx
=item --out=FILE.xlsx
Specify the output filename. Default it the same name as the input-file
where the trailing C<.csv> will be replaced with C<.xlsx>.
If a filename can not be automatically generated, it will default to
C<csv2xlsx.xlsx>.
=item -d format
=item --date-fmt=format
Use C<format> as date formats. Default = C<dd-mm-yyyy>.
=item -D range
=item --date-col=range
Only convert dates in columns C<range>. Default is everywhere.
Ranges are numeric, where the first column has index C<1>. A range is a
definition of sections joined by C<,>s. A section is either a single column
or a start/finish-pair joined with a dash. A missing finish index on the
last segment is an open range (till the end of the line).
If the range is C<0>, date conversion is disabled.
-D 0
-D 1,2,6
-D 2,4-8,12-
=item -C format
=item --curr-fmt=format
Use C<format> as currency formats for currency <C>, no default.
-C '$:### ### ##0.00'
=item --font=Calibri
You can set the default font. The XLSX writer used sets the default font
face to C<Calibri>.
--font=Arial
There are 3 default shortcuts
--font=M
--font=D
--font=DM
Setting the font to C<mono>, C<DejaVu Sans>, or C<DejaVu Sans Mono>.
=item --fs=10
=item --font-size=10
Optional set the default font-size to anything other than the default 10.
Must be a positive integer. No checks.
=item -f
=item --force
Force usage of the output file if it already exists (unlink before use).
=item -F
=item --formulas
Allow formula's. Otherwise fields starting with an equal sign C<=> are
forced to string type.
There are several shortcuts here to specify different behavior:
=over 2
=item --Fa=a
Define formula action: C<none>/C<die>/C<croak>/C<diag>/C<empty>/C<undef>
=item --Ft
Formula's will be stored as text (formula actions: none)
=item --Fd
Formula's will cause a die
=item --Fc
Formula's will cause a croak
=item --FD
Formula's will cause a warning (this is the default)
=item --Fe
Formula's will be replaced by the empty string
=item --Fu
Formula's will be replaced with an undefined cell
=back
=item -u
=item --utf-8
=item --utf8
CSV is UTF-8 encoded. Likely not needed, as most is auto-detected.
=item --de
=item --fix-utf8
Attempt to fix double-encoded UTF-8. e.g. C<\x{00c3}\x{00ab}> should have
been C<\x0100}>. YMMV.
=item -m
=item --merge
Merge multiple CSV's into a single Excel (separate sheets).
With this option, the option C<-o> is required. All arguments should be
existing files. Piping is not supported.
=item --skip-empty
When merging CSV files with C<-m>, skip empty files. The default is to exit
with an error message.
=item -L N
=item --row-limit=N
Limit export to C<N> rows.
=item -S CxP
=item --split=CxP
When dealing with big CSV datasets, this option enables you to split the
data over several sheets. When all the C<-S> options match in a single row,
that row will be the first row of a new sheet. (see also C<--sl=C>)
=over 2
=item C
The column that should be examined. C<A> = C<1>. If lower case, the value
of that column is matched case insensitive when appropriate.
-S 7=ab Column G is literal "ab"
-S G=ab Column G is literal "ab"
-S g=ab Column G is literal "ab", or "aB", or "Ab", or "AB"
=item x
Defines the operation on the column
=over 2
=item =
Literal match
-S G=ab Column G is literal "ab"
-S g=ab Column G is literal "ab", or "aB", or "Ab", or "AB"
=item /
Regex match
-S G/b[a-z] Column G matches /b[a-z]/
-S g/b[a-z] Column G matches /b[a-z]/i
=item u U
Check for defined
-S Gu Column G is undefined
-S GU Column G is defined
Similar for emptiness
-S G= Column G is defined but empty
-S G/. Column G is defined and not empty
=item < >
Compare. If the value in the column is defined compare. If the values only
holds digits, do a numeric compare, otherwise do a string compare.
-S G<42 Column G is defined and less than 42 (numeric)
-S G<ab Column G is defined and less than 42 (string)
-S G>42 Column G is defined and greater than 42 (numeric)
-S G>ab Column G is defined and greater than 42 (string)
=back
=item P
pattern or literal string. Quotation might be required differing per OS and
shell-environment.
=back
=item --sl=C
When splitting with C<-S> / C<--split>, if all criteria match and a new
sheet is to be created, use the value in column C<C> of the matching row
as the new sheet label.
=item -v [V]
=item --verbose[=V]
Set verbosity level. Default = 1. No argument will set to 2.
=back
=head1 SEE ALSO
csv2xls - Convert CSV to old Excel
=head1 AUTHOR
H.Merijn Brand
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2016-2023 H.Merijn Brand. All rights reserved.
This library is free software; you can redistribute and/or modify it under
the same terms as Perl itself.
=cut