The Perl Toolchain Summit 2025 Needs You: You can help 🙏 Learn more

# Copyright (c) 2024 Yuki Kimoto
# MIT License
class R::DataFrame {
version_from R;
use StringList;
use List;
use R::NDArray;
use R::DataFrame::Column;
# Fields
has colobjs_list : List of R::DataFrame::Column;
has colobjs_indexes_h : Hash of Int;
# Class Methods
static method new : R::DataFrame () {
my $self = new R::DataFrame;
my $colobjs_list = List->new(new R::DataFrame::Column[0]);
$self->{colobjs_list} = $colobjs_list;
my $colobjs_indexes_h = Hash->new;
$self->{colobjs_indexes_h} = $colobjs_indexes_h;
return $self;
}
# Instance Methods
method colnames : string[] () {
my $ncol = $self->ncol;
my $colnames = new string[$ncol];
for (my $i = 0; $i < $ncol; $i++) {
$colnames->[$i] = $self->colname($i);
}
return $colnames;
}
method exists_col : int ($colname : string) {
my $colobjs_indexes_h = $self->{colobjs_indexes_h};
my $exists = $colobjs_indexes_h->exists($colname);
return $exists;
}
method colname : string ($col : int) {
my $colobjs_list = $self->{colobjs_list};
unless ($col >= 0 && $col < $colobjs_list->length) {
die "The column at index $col dose not exist.";
}
my $colobj = (R::DataFrame::Column)$colobjs_list->get($col);
my $colname = $colobj->{name};
return $colname;
}
method colindex : int ($colname : string) {
my $ncol = $self->ncol;
my $match = 0;
for (my $col = 0; $col < $ncol; $col++) {
my $colname_cmp = $self->colname($col);
if ($colname eq $colname_cmp) {
return $col;
}
}
die "The column named \"$colname\" does not exists.";
}
method col_by_index : R::NDArray ($col : int) {
my $colobjs_list = $self->{colobjs_list};
unless ($col >= 0 && $col < $colobjs_list->length) {
die "The column at index $col does not exists.";
}
my $colobj = (R::DataFrame::Column)$colobjs_list->get($col);
my $ndarray = $colobj->{ndarray};
return $ndarray;
}
method first_col : R::NDArray () {
return $self->col_by_index(0);
}
method col : R::NDArray ($colname : string) {
unless ($self->exists_col($colname)) {
die "The column named \"$colname\" dose not exist.";
}
my $col = $self->colindex($colname);
my $ndarray = $self->col_by_index($col);
return $ndarray;
}
method set_col : void ($colname : string, $ndarray : R::NDArray) {
my $exists_col = $self->exists_col($colname);
if ($exists_col) {
my $col = $self->colindex($colname);
my $colobjs_list = $self->{colobjs_list};
my $colobj = (R::DataFrame::Column)$colobjs_list->get($col);
$colobj->{ndarray} = $ndarray;
$ndarray->make_dim_read_only;
}
else {
$self->insert_col($colname, $ndarray);
}
}
method insert_col : void ($colname : string, $ndarray : R::NDArray, $before_colname : string = undef) {
unless (length $colname) {
die "The column name \$colname must be a non-empty string.";
}
unless ($ndarray) {
die "The n-dimensional array \$ndarray must be defined.";
}
my $exists_col = $self->exists_col($colname);
if ($exists_col) {
die "The column named \"$colname\" already exists.";
}
if ($self->ncol > 0) {
unless (R::OP->equals_dim($ndarray, $self->first_col)) {
die "The dimensions of the n-dimensional array \$ndarray must be equal to the dimensions of the n-dimensional array of the first column of this data frame.";
}
}
my $colobj = new R::DataFrame::Column;
$colobj->{name} = $colname;
$colobj->{ndarray} = $ndarray;
$ndarray->make_dim_read_only;
my $col = -1;
if ($before_colname) {
$col = $self->colindex($before_colname);
}
else {
$col = $self->ncol;
}
my $colobjs_list = $self->{colobjs_list};
$colobjs_list->insert($col, $colobj);
$self->update_colobjs_indexes_h($col);
}
method ncol : int () {
my $colobjs_list = $self->{colobjs_list};
my $ncol = $colobjs_list->length;
return $ncol;
}
method nrow : int () {
my $ncol = $self->ncol;
my $nrow = 0;
if ($ncol > 0) {
unless ($self->first_col->is_vector) {
die "The n-dimensional array of the first column of this data frame must be a vector.";
}
$nrow = $self->first_col->length;
}
return $nrow;
}
method remove_col : void ($colname : string) {
my $col = $self->colindex($colname);
my $colobjs_list = $self->{colobjs_list};
$colobjs_list->remove($col);
my $colobjs_indexes_h = $self->{colobjs_indexes_h};
$colobjs_indexes_h->delete($colname);
$self->update_colobjs_indexes_h($col);
}
method clone : R::DataFrame ($shallow : int = 0) {
my $ncol = $self->ncol;
my $colnames = $self->colnames;
my $ret_data_frame = R::DataFrame->new;
for my $colname (@$colnames) {
my $ndarray = $self->col($colname);
my $ret_ndarray = $ndarray->clone($shallow);
$ret_data_frame->insert_col($colname, $ret_ndarray);
}
return $ret_data_frame;
}
method to_string : string () {
my $ncol = $self->ncol;
my $string_buffer = StringBuffer->new;
if ($ncol == 0) {
$string_buffer->push("Empty");
}
else {
my $first_ndarray = $self->first_col;
my $first_dim = $first_ndarray->dim;
my $first_dim_length = @$first_dim;
if ($first_dim_length <= 1) {
for (my $col = 0; $col < $ncol; $col++) {
my $colname = $self->colname($col);
$string_buffer->push("$colname");
unless ($col == $ncol - 1) {
$string_buffer->push("\t");
}
}
$string_buffer->push("\n");
my $nrow = $self->nrow;
for (my $row = 0; $row < $nrow; $row++) {
for (my $col = 0; $col < $ncol; $col++) {
my $ndarray = $self->col_by_index($col);
my $elem_string = $ndarray->elem_to_string($ndarray->data, $row);
$string_buffer->push($elem_string);
unless ($col == $ncol - 1) {
$string_buffer->push("\t");
}
}
$string_buffer->push("\n");
}
}
else {
for (my $col = 0; $col < $ncol; $col++) {
my $colname = $self->colname($col);
my $ndarray = $self->col_by_index($col);
my $dim = $ndarray->dim;
my $dim_length = @$dim;
$string_buffer->push("$colname\n");
my $ndarray_string = $ndarray->to_string;
$string_buffer->push($ndarray_string);
$string_buffer->push("\n\n");
}
}
}
my $string = $string_buffer->to_string;
return $string;
}
method slice : R::DataFrame ($colnames : string[], $axis_indexes_product : R::NDArray::Int[]) {
my $ncol = $self->ncol;
my $colobjs_indexes_h = $self->{colobjs_indexes_h};
unless ($colnames) {
$colnames = $self->colnames;
}
my $ret_data_frame = R::DataFrame->new;
for my $colname (@$colnames) {
my $ndarray = $self->col($colname);
my $ret_ndarray = $ndarray->slice($axis_indexes_product);
$ret_data_frame->insert_col($colname, $ret_ndarray);
}
return $ret_data_frame;
}
method set_order : void ($data_indexes_ndarray : R::NDArray::Int) {
my $ncol = $self->ncol;
for (my $col = 0; $col < $ncol; $col++) {
my $ndarray = $self->col_by_index($col);
$ndarray->set_order($data_indexes_ndarray);
}
}
method sort : void ($colnames_with_sort_order : string[]) {
my $order_ndarray = $self->order($colnames_with_sort_order);
$self->set_order($order_ndarray);
}
method order : R::NDArray::Int ($colnames_with_sort_order : string[]) {
unless ($colnames_with_sort_order) {
die "The column names \$colnames_with_sort_order must be defined.";
}
my $ncol = $self->ncol;
unless ($ncol > 0) {
die "The column numbers of this data frame must be greater than 0.";
}
my $first_ndarray = $self->first_col;
my $data_length = $self->first_col->length;
my $order_data_indexes_ndarray = R::OP::Int->seq(0, $data_length - 1);
my $order_data = $order_data_indexes_ndarray->data;
my $colnames_length = @$colnames_with_sort_order;
my $colnames = new string[$colnames_length];
my $order_by_desc_h = Hash->new;
for (my $col = 0; $col < $colnames_length; $col++) {
my $colname_with_order_by = $colnames_with_sort_order->[$col];
my $ret = Fn->split(" ", $colname_with_order_by);
my $colname = $ret->[0];
my $order_by = (string)undef;
if (@$ret > 1) {
$order_by = $ret->[1];
}
else {
$order_by = "asc";
}
unless ($self->exists_col($colname)) {
die "The column named \"$colname\" does not exist.";
}
unless (@$ret >= 1 && @$ret <= 2 && ($order_by eq "asc" || $order_by eq "desc")) {
die "The column name with the sort order \"$colname_with_order_by\" is invalid format.";
}
$colnames->[$col] = $colname;
if ($order_by eq "desc") {
$order_by_desc_h->set($colname, 1);
}
else {
$order_by_desc_h->set($colname, 0);
}
}
Sort->sort_int($order_data, [has that : R::DataFrame = $self, $colnames : string[], $order_by_desc_h : Hash] method : int ($a_data_index : int, $b_data_index : int) {
my $cmp = 0;
for my $colname (@$colnames) {
my $ndarray = $self->{that}->col($colname);
my $desc = $order_by_desc_h->get_int($colname);
if ($desc) {
$cmp = $ndarray->elem_cmp($ndarray->data, $b_data_index, $ndarray->data, $a_data_index);
}
else {
$cmp = $ndarray->elem_cmp($ndarray->data, $a_data_index, $ndarray->data, $b_data_index);
}
unless ($cmp == 0) {
last;
}
}
return $cmp;
});
return $order_data_indexes_ndarray;
}
# Private Class Methods
private static method reverse_int : void ($array : int[]) {
unless ($array) {
die "The array \$array must be defined.";
}
my $length = @$array;
for (my $i = 0; $i < $length / 2; $i++) {
my $temp = $array->[$i];
$array->[$i] = $array->[$length - $i - 1];
$array->[$length - $i - 1] = $temp;
}
}
# Private Static Methods
private method update_colobjs_indexes_h : void ($begin_col : int) {
my $colobjs_list = $self->{colobjs_list};
my $colobjs_indexes_h = $self->{colobjs_indexes_h};
for (my $i = $begin_col; $i < $colobjs_list->length; $i++) {
my $colobj = (R::DataFrame::Column)$colobjs_list->get($i);
my $colname = $colobj->{name};
$colobjs_indexes_h->set($colname, $i);
}
}
}