package Mail::SpamAssassin::Pyzor::Digest::Pieces; # Copyright 2018 cPanel, LLC. # All rights reserved. # http://cpanel.net # # <@LICENSE> # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to you under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at: # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # </@LICENSE> # use strict; use warnings; =encoding utf-8 =head1 NAME Mail::SpamAssassin::Pyzor::Digest::Pieces - Pyzor backend logic module =head1 DESCRIPTION This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>. It reimplements logic found in pyzor's F<digest.py> module (L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>). =cut #---------------------------------------------------------------------- use Encode (); our $VERSION = '0.03'; # each tuple is [ offset, length ] use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] ); use constant { _MIN_LINE_LENGTH => 8, _ATOMIC_NUM_LINES => 4, }; #---------------------------------------------------------------------- =head1 FUNCTIONS =head2 $strings_ar = digest_payloads( $EMAIL_MIME ) This imitates the corresponding object method in F<digest.py>. It returns a reference to an array of strings. Each string can be either a byte string or a character string (e.g., UTF-8 decoded). NB: RFC 2822 stipulates that message bodies should use CRLF line breaks, not plain LF (nor plain CR). We will thus convert any plain CRs in a quoted-printable message body into CRLF. Python, though, doesn't do this, so the output of our implementation of C<digest_payloads()> diverges from that of the Python original. It doesn't ultimately make a difference since the line-ending whitespace gets trimmed regardless, but it's necessary to factor in when comparing the output of our implementation with the Python output. =cut sub digest_payloads { my ($parsed) = @_; my @subparts; foreach my $part ($parsed->find_parts(qr/./, 1)) { push(@subparts, $part); } my @payloads; foreach my $p (@subparts) { my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $p->{'type'} ); my $payload; if ( $main_type eq 'text' ) { if ( $subtype eq 'plain' ) { $payload = $p->{'decoded'}; $payload =~ s/\\'/\'/gx; } else { $payload = $p->{'rendered'}; } utf8::upgrade($payload) if defined $payload; if ( $subtype eq 'html' ) { require Mail::SpamAssassin::Pyzor::Digest::StripHtml; $payload = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload); } } else { # This does no decoding, even of, e.g., quoted-printable or base64. $payload = $p->{'pristine_body'}; } next if not defined $payload; push @payloads, $payload; } return \@payloads; } #---------------------------------------------------------------------- =head2 normalize( $STRING ) This imitates the corresponding object method in F<digest.py>. It modifies C<$STRING> in-place. As with the original implementation, if C<$STRING> contains (decoded) Unicode characters, those characters will be parsed accordingly. So: $str = "123\xc2\xa0"; # [ c2 a0 ] == \u00a0, non-breaking space normalize($str); The above will leave C<$str> alone, but this: utf8::decode($str); normalize($str); ... will trim off the last two bytes from C<$str>. =cut sub normalize { ## no critic qw( Subroutines::RequireArgUnpacking ) # NULs are bad, mm-kay? $_[0] =~ tr<\0><>d; # NB: Python's \s without re.UNICODE is the same as Perl's \s # with the /a modifier. # # https://docs.python.org/2/library/re.html # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences # Python: re.compile(r'\S{10,}') $_[0] =~ s<\S{10,}><>ag; # Python: re.compile(r'\S+@\S+') $_[0] =~ s<\S+ @ \S+><>agx; # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE) $_[0] =~ s<[a-zA-Z]+ : \S+><>agx; # (from digest.py ...) # Make sure we do the whitespace last because some of the previous # patterns rely on whitespace. $_[0] =~ tr< \x09-\x0d><>d; # This is fun. digest.py's normalize() does a non-UNICODE whitespace # strip, then calls strip() on the string, which *will* strip Unicode # whitespace from the ends. $_[0] =~ s<\A\s+><>; $_[0] =~ s<\s+\z><>; return; } #---------------------------------------------------------------------- =head2 $yn = should_handle_line( $STRING ) This imitates the corresponding object method in F<digest.py>. It returns a boolean. =cut sub should_handle_line { return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH(); } #---------------------------------------------------------------------- =head2 $sr = assemble_lines( \@LINES ) This assembles a string buffer out of @LINES. The string is the buffer of octets that will be hashed to produce the message digest. Each member of @LINES is expected to be an B<octet string>, not a character string. =cut sub assemble_lines { my ($lines_ar) = @_; if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) { # cf. handle_atomic() in digest.py return \join( q<>, @$lines_ar ); } #---------------------------------------------------------------------- # cf. handle_atomic() in digest.py my $str = q<>; for my $ofs_len ( _HASH_SPEC() ) { my ( $offset, $length ) = @$ofs_len; for my $i ( 0 .. ( $length - 1 ) ) { my $idx = int( $offset * @$lines_ar / 100 ) + $i; next if !defined $lines_ar->[$idx]; $str .= $lines_ar->[$idx]; } } return \$str; } #---------------------------------------------------------------------- =head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE ) =cut use constant _QUOTED_PRINTABLE_NAMES => ( "quopri-codec", "quopri", "quoted-printable", "quotedprintable", ); # Make Encode::decode() ignore anything that doesn't fit the # given encoding. use constant _encode_check_ignore => q<>; sub parse_content_type { my ($content_type) = @_; # text/plain; charset=us-ascii my $ct_parse; if($content_type =~ /(\w+)\/(\w+); charset=(.*)/) { $ct_parse->{type} = $1; $ct_parse->{subtype} = $2; $ct_parse->{'attributes'}{'charset'} = $3; } elsif($content_type =~ /(\w+)\/(\w+)/) { $ct_parse->{type} = $1; $ct_parse->{subtype} = $2; $ct_parse->{'attributes'}{'charset'} = 'us-ascii'; } else { $ct_parse->{type} = 'text'; $ct_parse->{subtype} = 'plain'; $ct_parse->{'attributes'}{'charset'} = 'us-ascii'; } my $main = $ct_parse->{'type'} || q<>; my $sub = $ct_parse->{'subtype'} || q<>; my $encoding = $ct_parse->{'attributes'}{'charset'}; my $checkval; if ($encoding) { # Lower-case everything, convert underscore to dash, and remove NUL. $encoding =~ tr<A-Z_\0><a-z->d; # Apparently pyzor accommodates messages that put the transfer # encoding in the Content-Type. if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) { $checkval = Encode::FB_CROAK(); } } else { $encoding = 'ascii'; } # Match Python .decode()'s 'ignore' behavior $checkval ||= \&_encode_check_ignore; return ( $main, $sub, $encoding, $checkval ); } #---------------------------------------------------------------------- =head2 @lines = splitlines( $TEXT ) Imitates C<str.splitlines()>. (cf. C<pydoc str>) Returns a plain list in list context. Returns the number of items to be returned in scalar context. =cut sub splitlines { return split m<\r\n?|\n>, $_[0] if defined $_[0]; } 1;