package Go::Tokenize;
use warnings;
use strict;
use Carp;
use utf8;
# Export set-up. Nothing is exported by default: "tokenize" has to be
# asked for, either by name or through the ":all" tag.
require Exporter;
our @ISA = qw(Exporter);
our @EXPORT_OK = qw/tokenize/;
# The ":all" tag is an alias for everything listed in @EXPORT_OK.
our %EXPORT_TAGS = (
all => \@EXPORT_OK,
);
our $VERSION = '0.02';
use Text::LineNumber;
use C::Tokenize qw!$comment_re!;
# A raw string literal: everything between two backquotes, newlines
# included. Nothing inside it is treated as an escape.
our $bt_string_re = qr!`[^`]*`!;
# An interpreted string literal. A backslash always takes the character
# after it along, whatever that character is, so an escaped quote does
# not end the literal and an escaped backslash cannot hide the closing
# quote which follows it. The group is non-capturing so that it does not
# add a numbered capture to the patterns this is interpolated into.
our $q_string_re = qr!"(?:(?s:\\.)|[^"\\])*"!;
# Either kind of Go string literal.
our $string_re = qr!(?:$bt_string_re|$q_string_re)!;
# The 25 reserved words of Go, in the layout of the table at
# https://golang.org/ref/spec#Keywords
# "switch" is kept on a line of its own, after "package", on purpose:
# with the two words on one line PAUSE reads them as a declaration of
# a package called "switch".
our @keywords = qw!
break default func interface select
case defer go map struct
chan else goto package
switch
const fallthrough if range type
continue for import return var
!;
# The operators and punctuation of Go, in the layout of the table at
# https://golang.org/ref/spec#Operators_and_punctuation
# The table includes the tilde which the specification lists for type
# constraints.
our $operator_re;
{
    # Perl makes an error message "Possible attempt to separate words with
    # commas at lib/Go/Tokenize.pm line 40." See
    # https://stackoverflow.com/questions/19573977/
    no warnings 'qw';
    my @operators = (qw@
        +    &     +=    &=     &&    ==    !=    (    )
        -    |     -=    |=     ||    <     <=    [    ]
        *    ^     *=    ^=     <-    >     >=    {    }
        /    <<    /=    <<=    ++    =     :=    ,    ;
        %    >>    %=    >>=    --    !     ...   .    :
        &^   &^=   ~
    @);
    # make_re puts the longest operators first, so that, for example,
    # "<<=" is not read as "<<" followed by "=".
    $operator_re = make_re (@operators);
}
# The keywords as one alternation, again longest first.
our $keyword_re = make_re (@keywords);
# A Go integer literal, with optional "_" digit separators. The
# alternatives are tried in order and the first one which matches
# wins, so the prefixed forms have to come before the bare zero.
# Otherwise the leading zero of a literal such as 0x1F is taken to be
# the whole literal whenever the match is not anchored at its end.
our $integer_re = qr!
    0[xX][0-9a-fA-F_]+      # hexadecimal
    |
    0[bB][01_]+             # binary
    |
    0[oO]?[0-7_]+           # octal, with or without the letter o
    |
    [1-9][0-9_]*            # decimal
    |
    0                       # a zero by itself
!x;
# The names of Go's predeclared numeric types. "uintptr" has to be
# tried before the sized integers: the integer alternative matches its
# first four letters, "uint", and an unanchored match would stop there.
our $numeric_re = qr!
    uintptr
    |
    u?int(?:8|16|32|64)?
    |
    float(?:32|64)
    |
    complex(?:64|128)
    |
    byte
    |
    rune
!x;
# Go identifiers. References:
# https://golang.org/ref/spec#Letters_and_digits
# https://golang.org/ref/spec#unicode_letter
# https://perldoc.perl.org/perlre
# https://perldoc.perl.org/perlunicode
# The property names are those of PropertyValueAliases.txt.
#
# A letter is anything in the Unicode general category L, or an
# underscore.
my $letter = qr![\p{L}_]!;
# An identifier is one letter followed by any number of letters and
# decimal digits (Unicode general category Nd).
our $identifier_re = qr!$letter(?:\p{Nd}|$letter)*!;
# A Go rune literal: one character, or one escape sequence, between
# single quotes. See https://golang.org/ref/spec#Rune_literals
#
# The escapes are tried before the plain character, and the plain
# character may be neither a backslash nor a single quote nor a
# newline. If a backslash could stand for itself, a quote, a backslash
# and a quote would already be a complete literal, and the escaped
# single quote would be cut off after its third character.
our $rune_re = qr!
    '(?:
        \\u[0-9a-fA-F]{4}       # small u and four hexadecimal digits
    |
        \\U[0-9a-fA-F]{8}       # big U and eight hexadecimal digits
    |
        \\x[0-9a-fA-F]{2}       # x and two hexadecimal digits
    |
        \\[0-7]{3}              # three octal digits, no letter before them
    |
        # https://golang.org/ref/spec#escaped_char
        \\[abfnrtv\\'"]
    |
        [^\\'\n]                # any other single character
    )'!x;
# One white space character: space (U+0020), horizontal tab (U+0009),
# carriage return (U+000D) or line feed (U+000A).
our $whitespace_re = qr![ \t\r\n]!;
# Matches one Go token. Each alternative is a named capture, and the
# name of the capture which matched is the type of the token.
#
# For one starting position the alternatives are tried from top to
# bottom, so the order matters wherever two of them can begin with the
# same characters.
#
# Keywords and numeric type names must not be followed by a letter, a
# digit or an underscore. Without that condition an identifier such as
# "format" would be split into the keyword "for" and an identifier
# "mat", and "integer" into the type "int" and an identifier "eger".
#
# The delimiters are braces rather than exclamation marks because the
# negative lookahead is itself spelled with an exclamation mark.
our $go_re = qr{
    # Comment must go before everything else.
    (?<comment>$comment_re)
    |
    # String must go before everything except comments.
    (?<string>$string_re)
    |
    # A rune literal is one token, whatever is between its quotes.
    (?<rune>$rune_re)
    |
    (?<keyword>(?:$keyword_re)(?![\p{L}\p{Nd}_]))
    |
    (?<operator>$operator_re)
    |
    (?<integer>$integer_re)
    |
    (?<numeric>(?:$numeric_re)(?![\p{L}\p{Nd}_]))
    |
    (?<identifier>$identifier_re)
    |
    (?<whitespace>$whitespace_re)
}x;
# The token types, in alphabetical order. "tokenize" goes through this
# list to find out which kind of token it has just matched.
our @types = qw(
    comment     identifier  integer
    keyword     numeric     operator
    rune        string      whitespace
);
# Split Go source code into tokens.
#
# Argument: $go, the Go source code as a string.
#
# Returns a reference to an array of hash references, one for each
# token, in the order the tokens appear in $go. Each hash has the keys
#
#   contents - the text of the token
#   end      - the value of "pos" after the token was matched
#   start    - "end" minus the length of the token plus one, which is
#              the position of its first character counting from one
#   type     - the member of @types naming the kind of token
#   line     - what the off2lnr method of Text::LineNumber returns
#              for "start"
#
# Anything in $go which no alternative of $go_re matches is passed
# over without a token being made for it.
sub tokenize
{
    my ($go) = @_;
    my $tln = Text::LineNumber->new ($go);
    my @tokens;
    while ($go =~ /($go_re)/g) {
        my %token;
        $token{contents} = $1;
        $token{end} = pos ($go);
        $token{start} = $token{end} - length ($token{contents}) + 1;
        for my $type (@types) {
            # This has to be a test for definedness and not for truth:
            # the integer literal "0" is a false value in Perl, and a
            # truth test would leave that token without a type.
            if (defined $+{$type}) {
                $token{type} = $type;
                last;
            }
        }
        $token{line} = $tln->off2lnr ($token{start});
        push @tokens, \%token;
    }
    return \@tokens;
}
# Turn a list of literal strings into the text of a regular expression
# alternation which matches any one of them.
#
# Arguments: the strings to match.
#
# Returns a string, not a compiled regex. It has no group around it, so
# the caller has to supply one when interpolating it. The strings are
# put longest first, and alphabetically within one length, so that a
# string which is the beginning of another is tried after the longer
# one. Each string is escaped with quotemeta and matches literally.
sub make_re
{
    return join '|',
        map { quotemeta }
        sort { length ($b) <=> length ($a) or $a cmp $b }
        @_;
}
1;