————# The (extremely complex) rules for domain delegation.
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
=head1 NAME
Mail::SpamAssassin::RegistryBoundaries - domain delegation rules
=cut
use
strict;
use
warnings;
# use bytes;
our
@ISA
=
qw()
;
# called from SpamAssassin->init() to create $self->{util_rb}
sub
new {
my
$class
=
shift
;
$class
=
ref
(
$class
) ||
$class
;
my
(
$main
) =
@_
;
my
$self
= {
'main'
=>
$main
,
'conf'
=>
$main
->{conf},
};
bless
(
$self
,
$class
);
# Initialize valid_tlds_re for schemeless uri parsing, FreeMail etc
if
(
$self
->{conf}->{valid_tlds} && %{
$self
->{conf}->{valid_tlds}}) {
# International domain names are already in ASCII-compatible encoding (ACE)
my
$tlds
=
'(?<![a-zA-Z0-9-])(?:'
.
# make sure tld starts at boundary
join
(
'|'
,
keys
%{
$self
->{conf}->{valid_tlds}}).
')(?!(?:[a-zA-Z0-9-]|\.[a-zA-Z0-9]))'
;
# make sure it ends
# Perl 5.10+ trie optimizes lists, no need for fancy regex optimizing
if
(
eval
{
$self
->{valid_tlds_re} =
qr/$tlds/
i; 1; }) {
dbg(
"config: registryboundaries: %d tlds loaded"
,
scalar
keys
%{
$self
->{conf}->{valid_tlds}});
}
else
{
warn
"config: registryboundaries: failed to compile valid_tlds_re: $@\n"
;
$self
->{valid_tlds_re} =
qr/no_tlds_defined/
;
}
}
else
{
# Failsafe in case no tlds defined, we don't want this to match everything..
$self
->{valid_tlds_re} =
qr/no_tlds_defined/
;
warn
"config: registryboundaries: no tlds defined, need to run sa-update\n"
if
!
$self
->{main}->{ignore_site_cf_files};
}
$self
;
}
# This is required because the .us domain is nuts. See split_domain.
our
%US_STATES
=
qw(
ak 1 al 1 ar 1 az 1 ca 1 co 1 ct 1 dc 1 de 1 fl 1 ga 1 gu 1 hi 1 ia 1 id 1 il 1 in 1 ks 1 ky 1 la 1 ma 1 md 1 me 1 mi 1
mn 1 mo 1 ms 1 mt 1 nc 1 nd 1 ne 1 nh 1 nj 1 nm 1 nv 1 ny 1 oh 1 ok 1 or 1 pa 1 pr 1 ri 1 sc 1 sd 1 tn 1 tx 1 ut 1 va 1
vi 1 vt 1 wa 1 wi 1 wv 1 wy 1
)
;
###########################################################################
=head1 METHODS
=over 4
=item ($hostname, $domain) = split_domain ($fqdn, $is_ascii)
Cut a fully-qualified hostname into the hostname part and the domain
part, splitting at the DNS registry boundary.
Examples:
"www.foo.com" => ( "www", "foo.com" )
"www.foo.co.uk" => ( "www", "foo.co.uk" )
If $is_ascii given and true, skip idn_to_ascii() conversion
=cut
sub
split_domain {
my
(
$self
,
$domain
,
$is_ascii
) =
@_
;
if
(
$is_ascii
) {
utf8::encode(
$domain
)
if
utf8::is_utf8(
$domain
);
# force octets
$domain
=
lc
$domain
;
}
else
{
# convert to ascii, handles Unicode dot normalization also
$domain
= idn_to_ascii(
$domain
);
}
my
$hostname
=
''
;
if
(
defined
$domain
&&
$domain
ne
''
) {
# www..spamassassin.org -> www.spamassassin.org
$domain
=~
tr
/././s;
# leading/trailing dots
$domain
=~ s/^\.+//;
$domain
=~ s/\.+$//;
# Split scalar domain into components
my
@domparts
=
split
(/\./,
$domain
);
my
@hostname
;
while
(
@domparts
> 1) {
# go until we find the TLD
if
(
@domparts
== 2) {
# co.uk, etc.
my
$temp
=
join
(
"."
,
@domparts
);
# International domain names in ASCII-compatible encoding (ACE)
last
if
(
$self
->{conf}->{two_level_domains}{
$temp
});
}
elsif
(
@domparts
== 3) {
# demon.co.uk
# esc.edu.ar
# [^\.]+\.${US_STATES}\.us
if
(
$domparts
[2] eq
'us'
) {
last
if
(
$US_STATES
{
$domparts
[1]});
}
else
{
my
$temp
=
join
(
"."
,
@domparts
);
# International domain names in ASCII-compatible encoding (ACE)
last
if
(
$self
->{conf}->{three_level_domains}{
$temp
});
}
}
elsif
(
@domparts
== 4) {
if
(
$domparts
[3] eq
'us'
&&
((
$domparts
[0] eq
'pvt'
&&
$domparts
[1] eq
'k12'
) ||
(
$domparts
[0] =~ /^c[io]$/)))
{
# "Fire-Dept.CI.Los-Angeles.CA.US"
# "<school-name>.PVT.K12.<state>.US"
last
if
(
$US_STATES
{
$domparts
[2]});
}
}
push
(
@hostname
,
shift
@domparts
);
}
# Look for a sub-delegated TLD
# use @domparts to skip trying to match on TLDs that can't possibly
# match, but keep in mind that the hostname can be blank, so 4TLD needs 4,
# 3TLD needs 3, 2TLD needs 2 ...
#
unshift
@domparts
,
pop
@hostname
if
@hostname
;
$domain
=
join
(
"."
,
@domparts
);
$hostname
=
join
(
"."
,
@hostname
);
}
(
$hostname
,
$domain
);
}
###########################################################################
=item $domain = trim_domain($fqdn, $is_ascii)
Cut a fully-qualified hostname into the hostname part and the domain
part, returning just the domain.
Examples:
"www.foo.com" => "foo.com"
"www.foo.co.uk" => "foo.co.uk"
If $is_ascii given and true, skip idn_to_ascii() conversion
=cut
sub
trim_domain {
my
(
$self
,
$domain
,
$is_ascii
) =
@_
;
my
(
undef
,
$dom
) =
$self
->split_domain(
$domain
,
$is_ascii
);
return
$dom
;
}
###########################################################################
=item $ok = is_domain_valid($dom, $is_ascii)
Return C<1> if the domain/hostname uses valid known TLD, C<undef> otherwise.
If $is_ascii given and true, skip idn_to_ascii() conversion.
Note that this only checks the TLD validity and nothing else. To verify
that the complete fqdn is in a valid legal format, Util::is_fqdn_valid() can
additionally be used.
=back
=cut
sub
is_domain_valid {
my
(
$self
,
$dom
,
$is_ascii
) =
@_
;
return
0
unless
defined
$dom
;
if
(
$is_ascii
) {
utf8::encode(
$dom
)
if
utf8::is_utf8(
$dom
);
# force octets
$dom
=
lc
$dom
;
}
else
{
# convert to ascii, handles Unicode dot normalization also
$dom
= idn_to_ascii(
$dom
);
}
# domains don't have whitespace
return
0
if
(
$dom
=~ /\s/);
# ensure it ends in a known-valid TLD, and has at least 1 dot
return
0
unless
(
$dom
=~ /\.([^.]+)$/);
return
0
unless
exists
$self
->{conf}->{valid_tlds}{$1};
return
1;
# nah, it's ok.
}
#
sub
uri_to_domain {
my
$self
=
shift
;
my
$uri
=
lc
shift
;
# Javascript is not going to help us, so return.
# Likewise ignore cid, file
return
if
(
$uri
=~ /^(?:javascript|cid|file):/);
$uri
=~ s/\?.*//;
# drop parameters ?subject= etc
# note above, Outlook linkifies foo@bar%2Ecom&x.com to foo@bar.com !!
# uri_list_canonicalize should have made versions without ? &
# Keep testing with & here just in case..
return
if
$uri
=~ /\@.*?\@/;
# abort if multiple @
return
unless
$uri
=~ s/.*@//;
# drop username or abort
}
else
{
$uri
=~ s{^[a-z]+:/{0,2}}{}gs;
# drop the protocol
# strip path, CGI params, fragment. note: bug 4213 shows that "&" should
# *not* be likewise stripped here -- it's permitted in hostnames by
# some common MUAs!
$uri
=~ s{[/?
#].*}{}gs;
$uri
=~ s{^[^/]*\@}{}gs;
# drop username/passwd
$uri
=~ s{:\d*$}{}gs;
# port, bug 4191: sometimes the # is missing
}
# skip undecoded URIs if the encoded bits shouldn't be.
# we'll see the decoded version as well. see url_encode()
return
if
$uri
=~ /\%(?:2[1-9a-f]|[3-6][0-9a-f]|7[0-9a-e])/;
my
$host
= idn_to_ascii(
$uri
);
# unstripped/full domain name
my
$domain
=
$host
;
# keep IPs intact
if
(
$host
!~ IS_IP_ADDRESS) {
# check that it's a valid hostname/fqdn
return
unless
is_fqdn_valid(
$host
, 1);
# ignore invalid TLDs
return
unless
$self
->is_domain_valid(
$host
, 1);
# get rid of hostname part of domain, understanding delegation
$domain
=
$self
->trim_domain(
$host
, 1);
}
# optionally return unstripped host name
return
!
wantarray
?
$domain
: (
$domain
,
$host
);
}
1;