#!/usr/bin/env perl
use 5.10.1;
use strict;
use warnings;

# Modules used below: Const::Fast provides const, Try::Tiny provides try/catch,
# Path::Tiny provides path(), Test::More provides subtest/is/use_ok/done_testing.
use Const::Fast;
use Data::Dumper;
use Log::Log4perl qw(:easy);
use Path::Tiny;
use Test::More;
use Try::Tiny;

const my $DEBUG => $ENV{DEBUG} // 1;
const my $TEST  => $ENV{TEST} // 1;
const my $LF    => "\n";
const my $CR    => "\r";
const my $CRLF  => "\r\n";

BEGIN {
    $| = 1;
    Log::Log4perl->easy_init($ERROR);
    use_ok('CrawlerCommons::RobotRulesParser');
}
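
# Each case runs in its own subtest so one failure does not abort the rest of
# the run; try/catch only reports a completely unexpected fatal error.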
try {
    subtest "test empty rules"                => sub { test_empty_rules() };
    subtest "test query param in disallow"    => sub { test_query_param_in_disallow() };
    subtest "google pattern matching"         => sub { test_google_pattern_matching() };
    subtest "test commented-out line"         => sub { test_commented_out_lines() };
    subtest "test robots.txt always allow"    => sub { test_robots_text_always_allowed() };
    subtest "test agent not listed"           => sub { test_agent_not_listed() };
    subtest "test non-ascii encoding"         => sub { test_non_ascii_encoding() };
    subtest "test simplest allow all"         => sub { test_simplest_allow_all() };
    subtest "test mixed endings"              => sub { test_mixed_endings() };
    subtest "test rfp cases"                  => sub { test_rfp_cases() };
    subtest "test nutch cases"                => sub { test_nutch_cases() };
    subtest "test html markup in robots.txt"  => sub { test_html_markup_in_robots_txt() };
    subtest "test ignore of html"             => sub { test_ignore_of_html() };
    subtest "test heritrix cases"             => sub { test_heritrix_cases() };
    subtest "test case-sensitive paths"       => sub { test_case_sensitive_paths() };
    subtest "test empty disallow"             => sub { test_empty_disallow() };
    subtest "test empty allow"                => sub { test_empty_allow() };
    subtest "test multi wildcard"             => sub { test_multi_wildcard() };
    subtest "test multi matches"              => sub { test_multi_matches() };
    subtest "test multi agent names"          => sub { test_multi_agent_names() };
    subtest "test multi-word agent name"      => sub { test_multi_word_agent_name() };
    subtest "test unsupported fields"         => sub { test_unsupported_fields() };
    subtest "test acap fields"                => sub { test_acap_fields() };
    subtest "test crawl delay"                => sub { test_crawl_delay() };
    subtest "test big crawl delay"            => sub { test_big_crawl_delay() };
    subtest "test broken krugle robots txt file" => sub { test_broken_krugle_robots_txt_file() };
    subtest "test robots with utf8 bom"          => sub { test_robots_with_utf8_bom() };
    subtest "test robots with utf16le bom"       => sub { test_robots_with_utf16le_bom() };
    subtest "test robots with utf16be bom"       => sub { test_robots_with_utf16be_bom() };
    subtest "test floating point crawl delay"    => sub { test_floating_point_crawl_delay() };
    subtest "test ignoring host"                 => sub { test_ignoring_host() };
    subtest "test directive typos"               => sub { test_directive_typos() };
    subtest "test format errors"                 => sub { test_format_errors() };
    subtest "test extended standard"             => sub { test_extended_standard() };
    subtest "test sitemap"                       => sub { test_sitemap() };
    subtest "test relative sitemap"              => sub { test_relative_sitemap() };
    subtest "test many user agents"              => sub { test_many_user_agents() };
    subtest "test malformed path in robots file" => sub { test_malformed_path_in_robots_file() };
    subtest "test dos line endings"              => sub { test_dos_line_endings() };
    subtest "test amazon robots with wildcards"  => sub { test_amazon_robots_with_wildcards() };
    subtest "test allow before disallow"         => sub { test_allow_before_disallow() };
    subtest "test space in multiple user agent names" => sub { test_spaces_in_multiple_user_agent_names() };
    subtest "test sitemap at end of file"        => sub { test_sitemap_at_end_of_file() };
}
catch {
    say STDERR "Testing ended unexpectedly: $_";
};

done_testing;
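
# Helper: build a fresh parser and run the supplied robots.txt content through
# parse_content() for the named crawler, returning the resulting rules object.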
sub create_robot_rules {
    my ($crawler_name, $content) = @_;

    my $parser = CrawlerCommons::RobotRulesParser->new;
    return $parser->parse_content($FAKE_ROBOTS_URL, $content, "text/plain", $crawler_name);
}

sub test_case_sensitive_paths {
    my $robots_txt = join $CRLF,
        "User-agent: *",
        "Allow: /AnyPage.html",
        "Allow: /somepage.html",
        "Disallow: /";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    say STDERR Data::Dumper->Dump([$robot_rules], ['rules']) if $DEBUG > 1;
}

sub test_commented_out_lines {
    my $robots_txt = join( $LF,
        "#user-agent: testAgent",
        "",
        "#allow: /index.html",
        "#allow: /test",
        "",
        "#user-agent: test",
        "",
        "#allow: /index.html",
        "#disallow: /test",
        "",
        "#user-agent: someAgent",
        "",
        "#disallow: /index.html",
        "#disallow: /test",
        "",
    );

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_acap_fields {
    my $robots_txt =
          "acap-crawler: *" . $CRLF
        . "acap-disallow-crawl: /ultima_ora/";

    my $parser = CrawlerCommons::RobotRulesParser->new;
    my $rr = $parser->parse_content("url", $robots_txt, "text/plain", "foobot");
    is( $parser->num_warnings(), 0 );
}

sub test_agent_not_listed {
    my $robots_txt = join( $CRLF,
        "User-agent: crawler1",
        "Disallow: /index.html",
        "Allow: /",
        "",
        "User-agent: crawler2",
        "Disallow: /",
    );

    my $robot_rules = create_robot_rules("crawler3", $robots_txt);
}

sub test_allow_before_disallow {
    my $robots_txt =
          "User-agent: *" . $CRLF
        . "Disallow: /fish" . $CRLF
        . "Allow: /fish" . $CRLF;

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_amazon_robots_with_wildcards {
    my $robot_rules = create_robot_rules( "Any-darn-crawler",
        path(__FILE__)->parent->child("./robots/wildcards.txt")->slurp_utf8 );

    # is( ..., 1 );
    # is( $robot_rules->..., 0 );
}

sub test_big_crawl_delay {
    my $robots_txt =
          "User-agent: *" . $CR
        . "Crawl-delay: 3600" . $CR
        . "Disallow:" . $CR;

    my $robot_rules = create_robot_rules("bixo", $robots_txt);

    # is( ..., "disallow all if huge crawl delay" );
}

sub test_broken_krugle_robots_txt_file {
    my $robots_txt =
          "User-agent: *" . $CR
        . "Disallow: /maintenance.html" . $CR
        . "Disallow: /perl/" . $CR
        . "Disallow: /cgi-bin/" . $CR
        . "Disallow: /examples/" . $CR
        . "Crawl-delay: 3" . $CR
        . "" . $CR
        . "User-agent: googlebot" . $CR
        . "Crawl-delay: 1" . $CR
        . "" . $CR
        . "User-agent: qihoobot" . $CR
        . "Disallow: /";

    my $robot_rules = create_robot_rules("googlebot/2.1", $robots_txt);
}

sub test_crawl_delay {
    my $robots_txt =
          "User-agent: bixo" . $CR
        . "Crawl-delay: 10" . $CR
        . "User-agent: foobot" . $CR
        . "Crawl-delay: 20" . $CR
        . "User-agent: *" . $CR
        . "Disallow:/baz" . $CR;

    my $robot_rules = create_robot_rules("bixo", $robots_txt);
    is( 10000, $robot_rules->crawl_delay,
        "testing crawl delay for agent bixo - rule 1" );

    $robots_txt =
          "User-agent: foobot" . $CR
        . "Crawl-delay: 20" . $CR
        . "User-agent: *" . $CR
        . "Disallow:/baz" . $CR;

    $robot_rules = create_robot_rules("bixo", $robots_txt);
    is( $CrawlerCommons::RobotRules::UNSET_CRAWL_DELAY, $robot_rules->crawl_delay,
        "testing crawl delay for agent bixo - rule 2" );
}

sub test_directive_typos {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/directive-typos-robots.txt")->slurp_utf8 );

    $robot_rules = create_robot_rules( "bot2",
        path(__FILE__)->parent->child("./robots/directive-typos-robots.txt")->slurp_utf8 );
    # is( ..., "useragent" );

    $robot_rules = create_robot_rules( "bot3",
        path(__FILE__)->parent->child("./robots/directive-typos-robots.txt")->slurp_utf8 );
    # is( ..., "useg-agent" );

    $robot_rules = create_robot_rules( "bot4",
        path(__FILE__)->parent->child("./robots/directive-typos-robots.txt")->slurp_utf8 );
    # is( ..., "useragent-no-colon" );
}

sub test_dos_line_endings {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/dos-line-endings.txt")->slurp_utf8 );
    is( $robot_rules->crawl_delay, 1000 );
}

sub test_empty_allow {
    my $robots_txt =
          "User-agent: *" . $CRLF
        . "Allow:";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_empty_disallow {
    my $robots_txt =
          "User-agent: *" . $CRLF
        . "Disallow:";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_empty_rules {
    my $robot_rules = create_robot_rules("Any-darn-crawler", "");
    # is( ..., 'test empty rules' );
}

sub test_extended_standard {
    my $parser = CrawlerCommons::RobotRulesParser->new;
    my $rr = $parser->parse_content( "url",
        path(__FILE__)->parent->child("./robots/extended-standard-robots.txt")->slurp_utf8,
        "text/plain", "foobot" );
    is( $parser->num_warnings(), 0 );
}

sub test_floating_point_crawl_delay {
    my $robots_txt =
          "User-agent: *" . $CR
        . "Crawl-delay: 0.5" . $CR
        . "Disallow:" . $CR;

    my $robot_rules = create_robot_rules("bixo", $robots_txt);
    is( $robot_rules->crawl_delay, 500 );
}

sub test_format_errors {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/format-errors-robots.txt")->slurp_utf8 );
    # is( ..., 0, "whitespace-before-colon" );

    $robot_rules = create_robot_rules( "bot2",
        path(__FILE__)->parent->child("./robots/format-errors-robots.txt")->slurp_utf8 );
    # is( ..., "no-colon-useragent" );

    $robot_rules = create_robot_rules( "bot3",
        path(__FILE__)->parent->child("./robots/format-errors-robots.txt")->slurp_utf8 );
    # is( ..., 1, "whitespace-before-colon" );
}
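
# These cases mirror Google's documented robots.txt pattern matching: a plain
# prefix, a trailing "*", a directory path, and patterns using "*" together
# with the "$" end-of-URL anchor.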
sub test_google_pattern_matching {
    my $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /fish",
        "",
    );
    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /fish*",
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /fish/",
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /*.php",
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        'Disallow: /*.php$',
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /fish*.php",
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);

    $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /*fish*.php",
        "",
    );
    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_heritrix_cases {
    my $robots_txt =
          "User-agent: *\n"
        . "Disallow: /cgi-bin/\n"
        . "Disallow: /details/software\n"
        . "\n"
        . "User-agent: denybot\n"
        . "Disallow: /\n"
        . "\n"
        . "User-agent: allowbot1\n"
        . "Disallow: \n"
        . "\n"
        . "User-agent: allowbot2\n"
        . "Disallow: /foo\n"
        . "Allow: /\n"
        . "\n"
        . "User-agent: delaybot\n"
        . "Disallow: /\n"
        . "Crawl-Delay: 20\n"
        . "Allow: /images/\n";

    my $robot_rules = create_robot_rules("Mozilla allowbot1 99.9", $robots_txt);

    $robot_rules = create_robot_rules("Mozilla allowbot2 99.9", $robots_txt);

    $robot_rules = create_robot_rules("Mozilla denybot 99.9", $robots_txt);
    is( $CrawlerCommons::RobotRules::UNSET_CRAWL_DELAY, $robot_rules->crawl_delay );

    $robot_rules = create_robot_rules("Mozilla anonbot 99.9", $robots_txt);

    $robot_rules = create_robot_rules("Mozilla delaybot 99.9", $robots_txt);
    is( 20000, $robot_rules->crawl_delay );
}

sub test_html_markup_in_robots_txt {
    my $robots_txt = join "",
        "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n",
        "<HEAD>\n",
        "<TITLE>/robots.txt</TITLE>\n",
        "</HEAD>\n",
        "<BODY>\n",
        "User-agent: anybot<BR>\n",
        "Disallow: <BR>\n",
        "Crawl-Delay: 10<BR>\n",
        "\n",
        "User-agent: *<BR>\n",
        "Disallow: /<BR>\n",
        "Crawl-Delay: 30<BR>\n",
        "\n",
        "</BODY>\n",
        "</HTML>\n";

    my $robot_rules = create_robot_rules("anybot", $robots_txt);
    is( 10000, $robot_rules->crawl_delay );

    $robot_rules = create_robot_rules("bogusbot", $robots_txt);
    is( 30000, $robot_rules->crawl_delay );
}

sub test_ignore_of_html {
    my $robots_txt = "<HTML><HEAD><TITLE>Site under Maintenance</TITLE></HTML>";

    my $robot_rules = create_robot_rules("anybot", $robots_txt);
    is( $robot_rules->_defer_visits, 0 );
}

sub test_ignoring_host {
    my $robot_rules = create_robot_rules( "foobot",
        path(__FILE__)->parent->child("./robots/www.flot.com-robots.txt")->slurp_utf8 );
    # is( ..., "Disallow img directory" );
}

sub test_malformed_path_in_robots_file {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/malformed-path.txt")->slurp_utf8 );

    # is( $robot_rules->is_allowed(...), 0, "Disallowed URL" );
    # is( ..., "Regular URL" );
}

sub test_many_user_agents {
    my $robot_rules = create_robot_rules( "wget",
        path(__FILE__)->parent->child("./robots/many-user-agents.txt")->slurp_utf8 );

    $robot_rules = create_robot_rules( "mysuperlongbotnamethatmatchesnothing",
        path(__FILE__)->parent->child("./robots/many-user-agents.txt")->slurp_utf8 );
    # is( ..., "many-user-agents" );
}

sub test_mixed_endings {
    # Line endings are deliberately mixed (CR, LF, CRLF and NEL) within one file.
    my $robots_txt = join "",
        '# comments to webmaster@fict.org',
        "User-agent: unhipbot", $CR, $LF,
        "Disallow: /", $CR,
        "", $CRLF,
        "User-agent: webcrawler", $LF,
        "User-agent: excite", $CR,
        "Disallow: ", "\x{0085}", $CR,
        "User-agent: *", $CRLF,
        "Disallow: /org/plans.html", $LF,
        "Allow: /org/", $CR,
        "Allow: /serv", $CRLF,
        "Allow: /~mak", $LF,
        "Disallow: /", $CRLF;

    my $robot_rules = create_robot_rules("WebCrawler/3.0", $robots_txt);

    $robot_rules = create_robot_rules("Unknown/1.0", $robots_txt);
}

sub test_multi_agent_names {
    my $robots_txt =
          "User-agent: crawler1 crawler2" . $CRLF
        . "Disallow: /index.html" . $CRLF
        . "Allow: /";

    my $robot_rules = create_robot_rules("crawler2", $robots_txt);
}

sub test_multi_matches {
    my $robots_txt =
          "User-agent: crawlerbot" . $CRLF
        . "Disallow: /index.html" . $CRLF
        . "Allow: /" . $CRLF
        . $CRLF
        . "User-agent: crawler" . $CRLF
        . "Disallow: /";

    my $robot_rules = create_robot_rules("crawlerbot", $robots_txt);
}

sub test_multi_wildcard {
    my $robots_txt =
          "User-agent: *" . $CRLF
        . "Disallow: /index.html" . $CRLF
        . "Allow: /" . $CRLF
        . $CRLF
        . "User-agent: *" . $CRLF
        . "Disallow: /";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_multi_word_agent_name {
    my $robots_txt =
          "User-agent: Download Ninja" . $CRLF
        . "Disallow: /index.html" . $CRLF
        . "Allow: /";

    my $robot_rules = create_robot_rules("Download Ninja", $robots_txt);
}

sub test_non_ascii_encoding {
    my $robots_txt = join $CRLF,
        "User-agent: *",
        " # \x{00A2} \x{20B5}",
        "Disallow:";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_nutch_cases {
    my $nutch_robots_txt = join( $CR,
        "User-Agent: Agent1 #foo",
        "Disallow: /a",
        "Disallow: /b/a",
        "#Disallow: /c",
        "",
        "",
        "User-Agent: Agent2 Agent3#foo",
        "User-Agent: Agent4",
        "Disallow: /d",
        "Disallow: /e/d/",
        "",
        "User-Agent: *",
        "Disallow: /foo/bar/",
        "",
    );

    my $robot_rules = create_robot_rules("Agent1", $nutch_robots_txt);

    $robot_rules = create_robot_rules("Agent2", $nutch_robots_txt);

    $robot_rules = create_robot_rules("Agent3", $nutch_robots_txt);

    $robot_rules = create_robot_rules("Agent4", $nutch_robots_txt);

    $robot_rules = create_robot_rules("Agent5", $nutch_robots_txt);

    $robot_rules = create_robot_rules("Agent5,Agent2,Agent1,Agent3,*", $nutch_robots_txt);
}

sub test_query_param_in_disallow {
    my $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /index.cfm?fuseaction=sitesearch.results*",
    );

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
    # is( ..., 0, 'query param in disallow' );
}

sub test_relative_sitemap {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/relative-sitemap-robots.txt")->slurp_utf8 );
    is( $robot_rules->sitemaps_size, 1, "Found sitemap" );
}

sub test_rfp_cases {
    my $robots_txt = join $CRLF,
        '# comments to webmaster@fict.org',
        "",
        "User-agent: unhipbot",
        "Disallow: /",
        "",
        "",
        "User-agent: webcrawler",
        "User-agent: excite",
        "Disallow: ",
        "",
        "User-agent: *",
        "Disallow: /org/plans.html",
        "Allow: /org/",
        "Allow: /serv",
        "Allow: /~mak",
        "Disallow: /",
        "";

    my $robot_rules = create_robot_rules("UnhipBot/0.1", $robots_txt);

    $robot_rules = create_robot_rules("WebCrawler/3.0", $robots_txt);

    $robot_rules = create_robot_rules("Excite/1.0", $robots_txt);

    $robot_rules = create_robot_rules("Unknown/1.0", $robots_txt);
}

sub test_robots_text_always_allowed {
    my $robots_txt = join( $CRLF,
        "User-agent: *",
        "Disallow: /",
    );

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_robots_with_utf8_bom {
    my $robot_rules = create_robot_rules( "foobot",
        path(__FILE__)->parent->child("robots/robots-with-utf8-bom.txt")->slurp_raw );
    # is( ..., "Disallow match against *" );
}

sub test_robots_with_utf16le_bom {
    my $robot_rules = create_robot_rules( "foobot",
        path(__FILE__)->parent->child("robots/robots-with-utf16le-bom.txt")->slurp_raw );
    # is( ..., "Disallow match against *" );
}

sub test_robots_with_utf16be_bom {
    my $robot_rules = create_robot_rules( "foobot",
        path(__FILE__)->parent->child("robots/robots-with-utf16be-bom.txt")->slurp_raw );
    # is( ..., "Disallow match against *" );
}

sub test_simplest_allow_all {
    my $robots_txt = join $CRLF,
        "User-agent: *",
        "Disallow:";

    my $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_sitemap_at_end_of_file {
    my $robots_txt =
          "User-agent: a" . $CRLF
        . "Disallow: /content/dam/" . $CRLF
        . $CRLF
        . "User-agent: b" . $CRLF
        . "Disallow: /content/dam/" . $CRLF
        . $CRLF
        . "User-agent: c" . $CRLF
        . "Disallow: /content/dam/"
        # . $CRLF . "Sitemap: ..."
        ;

    my $robot_rules = create_robot_rules("a", $robots_txt);
    is( $robot_rules->sitemaps_size, 1 );
}

sub test_sitemap {
    my $robot_rules = create_robot_rules( "bot1",
        path(__FILE__)->parent->child("./robots/sitemap-robots.txt")->slurp_utf8 );
    is( $robot_rules->sitemaps_size, 3, "Found sitemap" );

    my $url = $robot_rules->get_sitemap(2);
    isnt( $url, lc($url), "Sitemap case check" );
}

sub test_spaces_in_multiple_user_agent_names {
    my $robots_txt =
          "User-agent: One, Two, Three" . $CRLF
        . "Disallow: /" . $CRLF
        . "" . $CRLF
        . "User-agent: *" . $CRLF
        . "Allow: /" . $CRLF;

    my $robot_rules = create_robot_rules("One", $robots_txt);

    $robot_rules = create_robot_rules("Two", $robots_txt);

    $robot_rules = create_robot_rules("Three", $robots_txt);

    $robot_rules = create_robot_rules("Any-darn-crawler", $robots_txt);
}

sub test_unsupported_fields {
    my $robots_txt =
          "User-agent: crawler1" . $CRLF
        . "Disallow: /index.html" . $CRLF
        . "Allow: /" . $CRLF
        . "newfield: 234" . $CRLF
        . "User-agent: crawler2" . $CRLF
        . "Disallow: /";

    my $robot_rules = create_robot_rules("crawler2", $robots_txt);
}