#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"
/*
 * Module-wide limits.  They start with the defaults below and can be
 * replaced from Perl through init().
 */
static int max_chars = 2000;    /* most characters allowed in one fragment */
static int max_lines = 20;      /* most lines allowed in one fragment */
static int max_cpl = 300;       /* longest line before do_fract() wraps it */
static int verbose = 0;         /* nonzero: emit diagnostics while working */
/*
 * Per-line character statistics, filled in by char_stat().
 */
struct char_stat
{
    char top_four[4];           /* most frequent printable ASCII characters */
    unsigned int n_top_four[4]; /* occurrence count of each top_four entry */
    unsigned int n_wordsx;      /* number of runs of word characters */
    int n_distinct;             /* distinct printable ASCII characters seen */
    int n_nonprint;             /* bytes below 32 (not whitespace) or above 127 */
    int n_whitespace;           /* space, tab, NUL, LF and CR bytes */
    int n_iswordx;              /* characters accepted by is_word_ch_x() */
    int n_other;                /* printable characters that are not word chars */
    int wet_indent;             /* index where content starts, minus the printable
                                   non-word characters counted before it */
};
/*
 * One (possibly wrapped) line of the input text, as recorded by do_fract().
 */
struct line
{
    int offset;             /* byte offset in the text where the line starts */
    int offset_l;           /* line number, counted from 1 */
    int end_offset;         /* byte offset at which the next line starts */
    int length;             /* value returned by char_stat() for this line */
    int weight;             /* score; bisect_fract() cuts after high scores */
    struct char_stat s;     /* character statistics of the line */
};
/*
 * Tuning constants for the scoring done in char_stat() and weight_line().
 * A higher weight on a line makes bisect_fract() more likely to cut the
 * text directly after that line.
 */
#define MAX_INDENT_WETNESS 2 // up to two printable non-word chars are tolerated inside the indent.
#define WEIGHT_BLANK_LINE 800 // line ended by a form feed, or by a newline directly after a newline.
#define WEIGHT_NO_INDENT 400 // indented line followed by a line without indent.
#define WEIGHT_wW_CHANGE 200 // line without word chars followed by a line that has word chars.
#define WEIGHT_INDENT_DIFF 100 // add/subtract max for indent change.
#define WEIGHT_LLEN_DIFF 300 // add/subtract max for line length change; favors cutting after short lines.
#define WEIGHT_DISTINCT_DIFF 50 // add/subtract max for distinct change.
#define WEIGHT_BALANCE_CHANGE 3 // factor applied when the word/non-word majority flips between lines.
#if 0 //UNUSED
#define WEIGHT_Ww_START 100 // non-word-char start -> word-char start (applies to prev line)
#define WEIGHT_LINE_LENGTH -1 // per character "cut after short lines" (applies to current line)
#endif
/*
 * Reader state for char_input_fetch(): the current character plus one
 * character of look-ahead.
 */
struct char_input
{
    int offset;         /* offset in the text of the current character */
    int next_offset;    /* offset in the text of the look-ahead character */
    int ch;             /* current character, after normalisation */
    int next_ch;        /* look-ahead character, CR already mapped to LF */
    int last_ch;        /* value ch had before the most recent fetch */
    int next_ch_raw;    /* look-ahead before the CR->LF mapping; it becomes
                           ch on the following fetch */
};
/*
 * Reset a struct char_input so that the next char_input_fetch() starts at
 * the beginning of the text: ch == '\0' marks "nothing fetched yet" and
 * offset == -1 lets the first fetch land on offset 0.
 *
 * Every line of the definition carries a continuation backslash so the
 * whole do { ... } while (0) body belongs to the macro and it can be used
 * like a single statement.
 */
#define char_input_init(in)         \
    do {                            \
        (in)->ch = '\0';            \
        (in)->last_ch = '\0';       \
        (in)->next_ch = '\n';       \
        (in)->offset = -1;          \
    } while (0)
/*
 * Advance *offp to the next non-NUL byte of text and return that byte.
 *
 * offp - in/out offset; the search starts at *offp + 1.
 * text - buffer being read.
 * len  - number of bytes in text.
 *
 * NUL bytes are skipped.  When no further byte exists, *offp is set to len
 * and a newline is returned, so the text always appears to end in '\n'.
 */
static int
do_fetch(int *offp, const char *text, int len)
{
    int pos;

    for (pos = *offp + 1; pos < len; pos++) {
        if (text[pos] != '\0') {
            *offp = pos;
            return text[pos];
        }
    }

    *offp = len;
    return '\n';
}
/*
 * Step the reader one character forward and return the new current
 * character (also stored in in->ch).
 *
 * The first call after char_input_init() (in->ch still '\0') fetches the
 * current character from the text; later calls promote the stored raw
 * look-ahead instead.  A two-byte CR/LF or LF/CR pair is consumed as one
 * line end, CR is reported as LF and VT is reported as FF.
 */
static int
char_input_fetch(struct char_input *in, const char *text, int len)
{
    if (in->ch) {
        /* Shift the look-ahead into the current slot. */
        in->last_ch = in->ch;
        in->ch = in->next_ch_raw;
        in->offset = in->next_offset;
    } else {
        /* Nothing fetched so far: read the current character itself. */
        in->ch = do_fetch(&in->offset, text, len);
        in->next_offset = in->offset;
    }
    in->next_ch = do_fetch(&in->next_offset, text, len);

    /* Swallow the second half of a two-byte line ending. */
    if ((in->ch == '\n' && in->next_ch == '\r') ||
        (in->ch == '\r' && in->next_ch == '\n')) {
        in->next_ch = do_fetch(&in->next_offset, text, len);
    }

    /* Remember the look-ahead before it gets normalised below. */
    in->next_ch_raw = in->next_ch;

    if (in->ch == '\v') {
        in->ch = '\f';
    }
    if (in->ch == '\r') {
        in->ch = '\n';
    }
    if (in->next_ch == '\r') {
        in->next_ch = '\n';
    }

    return in->ch;
}
/*
 * Extended "word character" test.
 *
 * Returns 1 for ASCII letters, ASCII digits and the characters
 * '_', '$', ':' and '.'; returns 0 for everything else.
 */
static int
is_word_ch_x(int c)
{
    if (c >= 'a' && c <= 'z')
        return 1;
    if (c >= 'A' && c <= 'Z')
        return 1;
    if (c >= '0' && c <= '9')
        return 1;

    switch (c) {
    case '_':
    case '$':
    case ':':
    case '.':
        return 1;
    default:
        return 0;
    }
}
static
int
char_stat(
struct
char_stat *s,
const
char
*text,
int
len)
{
int
ch_arr[128-32];
int
i, j, wetness = 0, in_word = 0;
for
(i = 0; i < 128-32; i++) ch_arr[i] = 0;
while
((len > 0) && (text[len-1] <=
' '
)) len--;
bzero((
void
*)s,
sizeof
(
struct
char_stat));
s->wet_indent = -1;
for
(i = 0; i < len; i++)
{
unsigned
char
c = (unsigned
char
)text[i];
if
(c ==
' '
|| c ==
'\t'
|| c ==
'\0'
|| c ==
'\n'
|| c ==
'\r'
)
{
in_word = 0;
s->n_whitespace++;
}
else
if
(c < 32 || c > 127)
{
in_word = 0;
if
(s->wet_indent < 0) s->wet_indent = i - wetness;
s->n_nonprint++;
}
else
{
ch_arr[c-32]++;
if
(is_word_ch_x(c))
{
if
(s->wet_indent < 0) s->wet_indent = i - wetness;
s->n_iswordx++;
if
(!in_word) s->n_wordsx++;
in_word++;
}
else
{
in_word = 0;
if
((s->wet_indent < 0) && (++wetness > MAX_INDENT_WETNESS))
s->wet_indent = i - wetness + 1;
s->n_other++;
}
}
}
if
(s->wet_indent < 0) s->wet_indent = len - wetness;
for
(j = 0; j < 128-32; j++)
{
if
(ch_arr[j]) s->n_distinct++;
}
for
(i = 0; i < 4; i++)
{
int
m = -1;
int
c = -1;
for
(j = 0; j < 128-32; j++)
{
if
(ch_arr[j] > m)
{
m = ch_arr[j];
c = j+32;
}
}
s->top_four[i] = c;
s->n_top_four[i] = m;
ch_arr[c-32] = 0;
}
return
len;
}
/*
 * Score the transition from line l1 to the following line l2 and add the
 * result to l1->weight.
 *
 * l1, l2 - two consecutive line records; statistics are computed through
 *          char_stat() for a record whose length is still 0.
 * text   - the text the line offsets refer to.
 *
 * When l2 already weighs at least WEIGHT_BLANK_LINE, the final weight of
 * l1 is added to l2 as well.
 */
static void
weight_line(struct line *l1, struct line *l2, const char *text)
{
    struct char_stat *cur = &l1->s;
    struct char_stat *nxt = &l2->s;
    int gain = 0;

    if (l1->length == 0)
        l1->length = char_stat(cur, text + l1->offset,
                               l1->end_offset - l1->offset);
    if (l2->length == 0)
        l2->length = char_stat(nxt, text + l2->offset,
                               l2->end_offset - l2->offset);

    /* Indentation stops entirely. */
    if (cur->wet_indent > 0 && nxt->wet_indent == 0)
        gain += WEIGHT_NO_INDENT;

    /* Relative change of the indentation. */
    if (nxt->wet_indent > 0)
        gain += (cur->wet_indent - nxt->wet_indent) * WEIGHT_INDENT_DIFF /
                (cur->wet_indent + nxt->wet_indent);

    /* Majority of word vs. other characters flips between the lines. */
    if ((cur->n_iswordx > cur->n_other) && (nxt->n_iswordx < nxt->n_other)) {
        gain += (cur->n_iswordx + nxt->n_other - cur->n_other - nxt->n_iswordx)
                * WEIGHT_BALANCE_CHANGE;
    } else if ((nxt->n_iswordx > nxt->n_other) &&
               (cur->n_iswordx < cur->n_other)) {
        gain += (nxt->n_iswordx + cur->n_other - nxt->n_other - cur->n_iswordx)
                * WEIGHT_BALANCE_CHANGE;
    }

    /* A line without word characters is followed by one that has some. */
    if ((cur->n_iswordx == 0) && nxt->n_iswordx > 0)
        gain += WEIGHT_wW_CHANGE;
    else
        gain += nxt->n_iswordx - cur->n_iswordx;

    /* Relative change in the number of distinct characters. */
    if (nxt->n_distinct > 0)
        gain += (nxt->n_distinct - cur->n_distinct) * WEIGHT_DISTINCT_DIFF /
                (nxt->n_distinct + cur->n_distinct);

    /* Relative change in line length. */
    if (l2->length > 0)
        gain += (l2->length - l1->length) * WEIGHT_LLEN_DIFF /
                (l2->length + l1->length);

    l1->weight += gain;

    if (l2->weight >= WEIGHT_BLANK_LINE)
        l2->weight += l1->weight;
}
/*
 * Append one fragment description to the result array r.
 *
 * The fragment starts where line start_idx ends (or at offset 0 when
 * start_idx is negative) and reaches to the end of line end_idx.  A
 * reference to a four element array is pushed:
 *   [0] byte offset of the fragment,
 *   [1] byte length of the fragment,
 *   [2] line number stored for line start_idx (1 when start_idx < 0),
 *   [3] line number of line end_idx minus element [2], plus one.
 */
static void
push_fracture(AV *r, struct line *larr, int start_idx, int end_idx)
{
    struct line *last = &larr[end_idx];
    int from_offset = 0;
    int from_line = 1;
    AV *fra;

    if (start_idx >= 0) {
        from_offset = larr[start_idx].end_offset;
        from_line = larr[start_idx].offset_l;
    }

    /* Mortal array plus newRV(): the reference ends up as the sole owner. */
    fra = (AV *)sv_2mortal((SV *)newAV());
    av_push(fra, newSVnv(from_offset));
    av_push(fra, newSVnv(last->end_offset - from_offset));
    av_push(fra, newSVnv(from_line));
    av_push(fra, newSVnv(last->offset_l - from_line+1));
    av_push(r, newRV((SV *)fra));
}
/*
 * Decide whether the lines start_idx+1 .. end_idx may form one fragment.
 *
 * Returns 1 when the range holds fewer than two lines.  Otherwise returns
 * 0 when it holds more than max_lines lines or when the summed length of
 * its lines exceeds max_chars, and 1 in all remaining cases.
 */
static int
is_small_enough(struct line *larr, int start_idx, int end_idx,
                int max_lines, int max_chars)
{
    int span = end_idx - start_idx;
    int total = 0;
    int k;

    if (span < 2)
        return 1;
    if (span > max_lines)
        return 0;

    for (k = start_idx + 1; k <= end_idx; k++)
        total += larr[k].length;

    return total <= max_chars;
}
/*
 * Return the index of the heaviest line within [i1, i2).
 *
 * Among equal weights the lowest index wins.  When the range is empty,
 * i1 itself is returned.
 */
static int
find_max_idx(struct line *larr, int i1, int i2)
{
    int best = i1;
    int k;

    for (k = i1 + 1; k < i2; k++) {
        if (larr[k].weight > larr[best].weight)
            best = k;
    }
    return best;
}
/*
 * Recursively split the lines start_idx+1 .. end_idx into fragments and
 * push each finished fragment onto r.
 *
 * A range accepted by is_small_enough() is pushed as it is.  Any other
 * range is cut after its heaviest line and both halves are processed the
 * same way.  While the heaviest line sits on the edge of the search
 * window, the window is narrowed from that side and the search repeated,
 * until the window would hold fewer than two lines.
 */
static void
bisect_fract(AV *r, struct line *larr, int start_idx, int end_idx,
             int max_lines, int max_chars)
{
    int cut;
    int lo_skip = 1;    /* lines left out at the front of the window */
    int hi_skip = 1;    /* lines left out at the back of the window */

    if (is_small_enough(larr, start_idx, end_idx, max_lines, max_chars)) {
        push_fracture(r, larr, start_idx, end_idx);
        return;
    }

    cut = find_max_idx(larr, start_idx + lo_skip, end_idx - hi_skip);

    /* Heaviest line is the last one of the window: pull the back edge in. */
    while (cut == end_idx - hi_skip - 1) {
        hi_skip++;
        if (end_idx - hi_skip < start_idx + lo_skip + 2)
            break;
        cut = find_max_idx(larr, start_idx + lo_skip, end_idx - hi_skip);
    }

    /* Heaviest line is the first one of the window: push the front edge in. */
    while (cut == start_idx + lo_skip) {
        lo_skip++;
        if (end_idx - hi_skip < start_idx + lo_skip + 2)
            break;
        cut = find_max_idx(larr, start_idx + lo_skip, end_idx - hi_skip);
    }

    /* Moving the front edge may have left the maximum at the back again. */
    while (cut == end_idx - hi_skip - 1) {
        hi_skip++;
        if (end_idx - hi_skip < start_idx + lo_skip + 2)
            break;
        cut = find_max_idx(larr, start_idx + lo_skip, end_idx - hi_skip);
    }

    bisect_fract(r, larr, start_idx, cut, max_lines, max_chars);
    bisect_fract(r, larr, cut, end_idx, max_lines, max_chars);
}
MODULE = Text::Fracture PACKAGE = Text::Fracture
PROTOTYPES: ENABLE
int
init(obj)
        HV *obj
    PREINIT:
        SV **pp;
        int new_max_chars;
        int new_max_lines;
        int new_max_cpl;
        int new_verbose;
    CODE:
        /*
         * Take over the settings max_chars, max_lines, max_cpl and verbose
         * from the hash obj; keys that are absent keep their current value.
         * Croaks on an unusable combination and returns 1 otherwise.
         *
         * The candidates are collected in locals and only stored once all
         * checks have passed, so a failed call leaves the previous settings
         * in place.
         */
        new_max_chars = max_chars;
        new_max_lines = max_lines;
        new_max_cpl = max_cpl;
        new_verbose = verbose;
        pp = hv_fetch(obj, "max_chars", 9, 0);
        if (pp) new_max_chars = (int)SvUV(*pp);
        pp = hv_fetch(obj, "max_lines", 9, 0);
        if (pp) new_max_lines = (int)SvUV(*pp);
        pp = hv_fetch(obj, "max_cpl", 7, 0);
        if (pp) new_max_cpl = (int)SvUV(*pp);
        pp = hv_fetch(obj, "verbose", 7, 0);
        if (pp) new_verbose = (int)SvUV(*pp);
        /* do_fract() divides by max_cpl, so it has to be positive. */
        if (new_max_cpl < 1)
            croak("max_cpl must be > 0, not %d\n", new_max_cpl);
        if (new_max_chars < new_max_cpl)
            croak("max_chars=%d must be greater than max_cpl=%d\n",
                  new_max_chars, new_max_cpl);
        if (new_max_lines <= 1)
            croak("max_lines must > 1, not %d\n", new_max_lines);
        max_chars = new_max_chars;
        max_lines = new_max_lines;
        max_cpl = new_max_cpl;
        verbose = new_verbose;
        RETVAL = 1;
    OUTPUT:
        RETVAL
SV *
do_fract(sv_text)
        SV *sv_text
    PREINIT:
        int larr_size = 0;
        int larr_idx = 0;
        struct line *larr = NULL;
        struct line *l = NULL;
        AV *r;
        int text_lnr;
        int line_off;
        int ends_line;
        int last_nonprint_off;
        int last_whitespace_off;
        int last_nonwordx_off;
        STRLEN text_len;
        struct char_input in;
        const char *text;
    INIT:
        text = (const char *)SvPV(sv_text, text_len);
        r = (AV *)sv_2mortal((SV *)newAV());
        last_whitespace_off = last_nonprint_off = last_nonwordx_off = 0;
    CODE:
        /*
         * Split the text into fragments and return a reference to an array
         * holding one entry per fragment, as produced by push_fracture().
         *
         * Pass 1 estimates how many line records are needed, pass 2 fills
         * them in (wrapping lines longer than max_cpl), then neighbouring
         * lines are scored and the table is bisected recursively.
         */
        if (verbose) warn(" max_chars=%d, max_lines=%d, max_cpl=%d\n text_len=%d\n",
                          max_chars, max_lines, max_cpl, (int)text_len);
        /* max_cpl is used as a divisor below. */
        if (max_cpl < 1)
            croak("max_cpl must be > 0, not %d\n", max_cpl);
        /* Pass 1: size the line table generously. */
        line_off = 0;
        char_input_init(&in);
        while (in.offset < (int)text_len) {
            char_input_fetch(&in, text, (int)text_len);
            if ((in.ch == '\n') || (in.ch == '\f') ||
                (in.next_offset - line_off > max_cpl)) {
                larr_size += (int)((in.next_offset - line_off) / max_cpl * 2) + 1;
                line_off = in.next_offset;
            }
        }
        larr_size++;
        /* One spare record: weight_line() looks one entry past the last line. */
        larr = (struct line *)calloc((size_t)larr_size + 1, sizeof(struct line));
        if (!larr)
            croak("do_fract: out of memory for %d line records\n", larr_size + 1);
        /* Pass 2: record where every line starts and ends. */
        line_off = 0;
        text_lnr = 1;
        larr_idx = 0;
        char_input_init(&in);
        while (in.offset < (int)text_len) {
            unsigned char c;
            char_input_fetch(&in, text, (int)text_len);
            c = (unsigned char)in.ch;
            /* Remember candidate positions for wrapping an overlong line. */
            if (!is_word_ch_x(c))
                last_nonwordx_off = in.offset;
            if (c == ' ' || c == '\t' || c == '\n' || c == '\r')
                last_whitespace_off = in.offset;
            if ((c && c < 32) || c > 127)
                last_nonprint_off = in.offset;
            ends_line = (in.ch == '\n') || (in.ch == '\f');
            if (ends_line || (in.next_offset - line_off > max_cpl)) {
                /*
                 * Check before writing: this record, the closing record and
                 * the spare one must all fit into the table.
                 */
                if (larr_idx + 1 >= larr_size) {
                    free((void *)larr);
                    croak("do_fract: line table too small (%d records)\n", larr_size);
                }
                l = &larr[larr_idx++];
                l->offset = line_off;
                l->offset_l = text_lnr;
                if (ends_line) {
                    line_off = l->end_offset = in.next_offset;
                    if ((in.ch == '\f') || (in.ch == '\n' && in.last_ch == '\n'))
                        l->weight = WEIGHT_BLANK_LINE;
                    text_lnr++;
                } else {
                    /*
                     * Overlong line: break at the latest whitespace,
                     * non-printable or non-word character found beyond the
                     * first half of max_cpl, else right here.
                     */
                    int break_off, min_break_off;
                    break_off = in.next_offset;
                    min_break_off = line_off+max_cpl/2;
                    if (last_whitespace_off > min_break_off)
                        break_off = last_whitespace_off;
                    else if (last_nonprint_off > min_break_off)
                        break_off = last_nonprint_off;
                    else if (last_nonwordx_off > min_break_off)
                        break_off = last_nonwordx_off;
                    line_off = l->end_offset = break_off;
                }
            }
        }
        assert(larr_idx < larr_size);
        /* Closing record: always heavy. */
        l = &larr[larr_idx];
        l->offset = line_off;
        l->offset_l = text_lnr;
        l->end_offset = in.next_offset;
        l->weight = WEIGHT_BLANK_LINE * 2;
        larr_size = larr_idx;
        for (larr_idx = 0; larr_idx <= larr_size; larr_idx++) {
            weight_line(&larr[larr_idx], &larr[larr_idx+1], text);
            if (verbose > 1) {
                printf("larr[%d].w=%-5d lno=%-3d l=%d\n",
                       larr_idx, larr[larr_idx].weight,
                       larr[larr_idx].offset_l, larr[larr_idx].length);
            }
        }
        bisect_fract(r, larr, -1, larr_size, max_lines, max_chars);
        free((void *)larr);
        RETVAL = newRV((SV *)r);
    OUTPUT:
        RETVAL