##
# name: WikiText::Socialtext::Parser
# abstract: Socialtext WikiText Parser Module
# author: Ingy döt Net <ingy@cpan.org>
# license: perl
# copyright: 2008, 2010, 2011
use strict;
# Reusable regexp generators used by the grammar
my $ALPHANUM = '\p{Letter}\p{Number}\pM';
# These are all stolen from URI.pm
my $reserved = q{;/?:@&=+$,[]#};
my $mark = q{-_.!~*'()};
my $unreserved = "A-Za-z0-9\Q$mark\E";
my $uric = quotemeta($reserved) . $unreserved . "%";
my %im_types = (
yahoo => 'yahoo',
ymsgr => 'yahoo',
callto => 'callto',
skype => 'callto',
callme => 'callto',
aim => 'aim',
msn => 'msn',
asap => 'asap',
);
my $im_re = join '|', keys %im_types;
sub create_grammar {
my $all_phrases = [
qw(waflphrase asis wikilink a im mail tt b i del)
];
my $all_blocks = [
qw(
pre wafl_block
hr hx
waflparagraph
ul ol
blockquote table
p empty
else
)
];
return {
_all_blocks => $all_blocks,
_all_phrases => $all_phrases,
top => {
blocks => $all_blocks,
},
empty => {
match => qr/^\s*\n/,
filter => sub {
my $node = shift;
$node->{type} = '';
},
},
wafl_block => {
match => qr/(?:^\.([\w\-]+)\ *\n)((?:.*\n)*?)(?:\.\1\ *\n|\z)/,
},
p => {
match => qr/^( # Capture whole thing
(?m:
^(?! # All consecutive lines *not* starting with
(?:
[\#\-\*]+[\ ] |
[\^\|\>] |
\.\w+\s*\n |
\{[^\}]+\}\s*\n
)
)
.*\S.*\n
)+
)
(\s*\n)* # and all blank lines after
/x,
phrases => $all_phrases,
filter => sub { chomp },
},
else => {
match => qr/^(.*)\n/,
phrases => [],
filter => sub {
my $node = shift;
$node->{type} = 'p';
},
},
pre => {
match => qr/^(?m:^\.pre\ *\n)((?:.*\n)*?)(?m:^\.pre\ *\n)(?:\s*\n)?/,
},
blockquote => {
match => qr/^((?m:^>.*\n)+)(\s*\n)?/,
blocks => $all_blocks,
filter => sub {
s/^>\ ?//gm;
},
},
waflparagraph => {
match => qr/^\{(.*)\}[\ \t]*\n(?:\s*\n)?/,
filter => sub {
my $node = shift;
my ($function, $options) = split /[: ]/, $node->{text}, 2;
my $replacement = defined $1 ? $1 : '';
$options = '' unless defined $options; # protect against an undefined here
$options =~ s/\s*(.*?)\s*/$replacement/;
$node->{attributes}{function} = $function;
$node->{attributes}{options} = $options;
undef $_;
},
},
hx => {
match => qr/^(\^+) *(.*?)(\s+=+)?\s*?\n+/,
phrases => $all_phrases,
filter => sub {
my $node = shift;
$node->{type} = 'h' . length($node->{1});
$_ = $node->{text} = $node->{2};
},
},
ul => {
match => re_list('[\*\-\+]'),
blocks => [qw(ul ol subl li)],
filter => sub { s/^[\*\-\+\#] *//mg },
},
ol => {
match => re_list('\#'),
blocks => [qw(ul ol subl li)],
filter => sub { s/^[\*\#] *//mg },
},
subl => {
type => 'li',
match => qr/^( # Block must start at beginning
# Capture everything in $1
(.*)\n # Capture the whole first line
[\*\#]+\ .*\n # Line starting with one or more bullet
(?:[\*\#]+\ .*\n)* # Lines starting with '*' or '#'
)(?:\s*\n)?/x, # Eat trailing lines
blocks => [qw(ul ol li2)],
},
li => {
match => qr/(.*)\n/, # Capture the whole line
phrases => $all_phrases,
},
li2 => {
type => '',
match => qr/(.*)\n/, # Capture the whole line
phrases => $all_phrases,
},
hr => {
match => qr/^--+(?:\s*\n)?/,
},
table => {
match => qr/^(
(
(?m:^\|.*\|\ \n(?=\|))
|
(?m:^\|.*\|\ \ +\n)
|
(?ms:^\|.*?\|\n)
)+
)(?:\s*\n)?/x,
blocks => ['tr'],
},
tr => {
match => qr/^((?m:^\|.*?\|(?:\n| \n(?=\|)| +\n)))/s,
blocks => ['td'],
filter => sub { s/\s+\z// },
},
# XXX Need to support blocks in TD
td => {
match => qr/\|?\s*(.*?)\s*\|\n?/s,
phrases => $all_phrases,
},
wikilink => {
match => qr/
(?:"([^"]*)"\s*)?(?:^|(?<=[^$ALPHANUM]))\[(?=[^\s\[\]])
(.*?)
\](?=[^$ALPHANUM]|\z)
/x,
filter => sub {
my $node = shift;
$node->{attributes}{target} = $node->{2};
$_ = $node->{1} || $node->{2};
},
},
b => {
match => re_huggy(q{\*}),
phrases => $all_phrases,
},
tt => {
match => re_huggy(q{\`}),
},
i => {
match => WikiText::Socialtext::Parser::re_huggy(q{\_}),
phrases => $all_phrases,
},
del => {
match => re_huggy(q{\-}),
phrases => $all_phrases,
},
im => {
match => qr/(\b(?:$im_re)\:[^\s\>\)]+)/,
filter => sub {
my $node = shift;
my ($type, $id) = split /:/, $node->{text}, 2;
$node->{attributes}{type} = $type;
$node->{attributes}{id} = $id;
undef $_;
},
},
waflphrase => {
match => qr/
(?:^|(?<=[\s\-]))
(?:"(.+?)")?
\{
([\w-]+)
(?=[\:\ \}])
(?:\s*:)?
\s*(.*?)\s*
\}
(?=[^A-Za-z0-9]|\z)
/x,
filter => sub {
my $node = shift;
my ($label, $function, $options) = @{$node}{qw(1 2 3)};
$label ||= '';
$node->{attributes}{function} = $function;
$node->{attributes}{options} = $options;
$_ = $label;
},
},
mail => {
match => qr/
(?:"([^"]*)"\s*)?
<?
(?:mailto:)?
([\w+%\-\.]+@(?:[\w\-]+\.)+[\w\-]+)
>?
/x,
filter => sub {
my $node = shift;
$_ = $node->{1} || $node->{2};
$node->{attributes}{address} = $node->{2};
},
},
a => {
type => 'hyperlink',
match => qr{
(?:"([^"]*)"\s*)?
<?
(
(?:http|https|ftp|irc|file):
(?://)?
[$uric]+
[A-Za-z0-9/#]
)
>?
}x,
filter => sub {
my $node = shift;
$_ = $node->{1} || $node->{2};
$node->{attributes}{target} = $node->{2};
},
},
asis => {
match => qr/
\{\{
(.*?)
\}\}(\}*)
/xs,
filter => sub {
my $node = shift;
$node->{type} = '';
$_ = $node->{1} . $node->{2};
},
},
};
}
sub re_huggy {
my $brace1 = shift;
my $brace2 = shift || $brace1;
qr/
(?:^|(?<=[^{$ALPHANUM}$brace1]))$brace1(?=\S)(?!$brace2)
(.*?)
$brace2(?=[^{$ALPHANUM}$brace2]|\z)
/x;
}
sub re_list {
my $bullet = shift;
return qr/^( # Block must start at beginning
# Capture everything in $1
^$bullet+\ .*\n # Line starting with one or more bullet
(?:[\*\-\+\#]+\ .*\n)* # Lines starting with '*' or '#'
)(?:\s*\n)?/x, # Eat trailing lines
}
1;