lib/Mail/SpamAssassin/Message/Metadata.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
—
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
—
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
              # $Id: MIME.pm,v 1.8 2003/10/02 22:59:00 quinlan Exp $
# <@LICENSE>
# Copyright 2004 Apache Software Foundation
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>
=head1 NAME

Mail::SpamAssassin::Message::Metadata - extract metadata from a message

=head1 SYNOPSIS

=head1 DESCRIPTION

This class is tasked with extracting "metadata" from messages for use as
Bayes tokens, fodder for eval tests, or other rules.  Metadata is
supplemental data inferred from the message, like the examples below.

It is held in two forms:

1. as name-value pairs of strings, presented in mail header format.  For
  example, "X-Language" => "en".  This is the general form for simple
  metadata that's useful as Bayes tokens, can be added to marked-up
  messages using "add_header", etc., such as the trusted-relay inference
  and language detection.

2. as more complex data structures on the $msg->{metadata} object.  This
  is the form used for metadata like the HTML parse data, which is stored
  there for access by eval rule code.   Because it's not simple strings,
  it's not added as a Bayes token by default (Bayes needs simple strings).

=head1 PUBLIC METHODS

=over 4

=cut

package Mail::SpamAssassin::Message::Metadata;
use strict;
use bytes;
use Mail::SpamAssassin;
use Mail::SpamAssassin::Constants qw(:sa);
use Mail::SpamAssassin::TextCat;
use Mail::SpamAssassin::Message::Metadata::Received;
=item new()
=cut
# Constructor.
#
# May be invoked on a class name or on an existing instance; in the latter
# case the new object is blessed into the instance's class.
#
# Arguments:
#   $msg - stored unchanged under the 'msg' key of the new object.
#
# Returns the new blessed hash reference, which also carries an empty
# 'strings' hash.
sub new {
  my $invocant = shift;
  my $msg = shift;

  my $package = ref($invocant) || $invocant;

  my %fields = (
    msg     => $msg,
    strings => {},
  );

  return bless \%fields, $package;
}
# Run every metadata extraction step for a message, in a fixed order.
#
# Arguments:
#   $msg  - the message; handed to the Received-header parser and to plugins.
#   $main - object providing call_plugins(); also passed to each step.
#
# Returns whatever $main->call_plugins() returns.
sub extract {
  my $self = shift;
  my ($msg, $main) = @_;

  # Received headers are pre-chewed first.
  $self->parse_received_headers($main, $msg);

  # Language identification (when enabled) has to happen before any Bayes
  # tests run, so that they can use the result as a token.
  $self->check_language($main);

  # Finally let plugins contribute their own metadata.
  my %plugin_args = (msg => $msg);
  return $main->call_plugins("extract_metadata", \%plugin_args);
}
# Release the data this object holds under its 'msg' and 'strings' keys.
# Any other keys on the object are left alone.
#
# Returns the value removed from the 'strings' slot.
sub finish {
  my $self = shift;

  delete $self->{msg};
  return delete $self->{strings};
}
# ---------------------------------------------------------------------------
# Identify the language(s) of the message body and record the result.
#
# Arguments:
#   $main - object whose {conf}{ok_languages} setting and
#           {languages_filename} entry are read.
#
# Effects on $self:
#   {languages_body_len} - length of the (possibly truncated) body text
#   {textcat_matches}    - array ref of matches; empty when the body is
#                          too short to classify
# When classification runs, the matches are also stored on the message as
# "X-Languages" metadata.
#
# Nothing at all is recorded when ok_languages contains "all".
sub check_language {
  my $self = shift;
  my $main = shift;

  # If the user accepts every language there is nothing to do.
  # TODO: might want to have them as bayes tokens all the same, though.
  # should we add a new config setting to control that?  or make it a
  # plugin?
  foreach my $lang (split(' ', $main->{conf}->{ok_languages})) {
    return if $lang eq "all";
  }

  my $lines = $self->{msg}->get_rendered_body_text_array();
  my $text = join("\n", @{$lines});
  $text =~ s/^Subject://i;

  # 10k of text should be plenty to classify it; discard the rest.
  my $limit = 10000;
  my $len = length($text);
  if ($len > $limit) {
    substr($text, $limit, $len - $limit, '');
    $len = $limit;
  }

  # The check_languages() eval rule also uses the body length, so keep it.
  $self->{languages_body_len} = $len;

  # need about 256 bytes for reasonably accurate match (experimentally derived)
  if ($len < 256) {
    dbg("Message too short for language analysis");
    $self->{textcat_matches} = [];
    return;
  }

  my @matches = Mail::SpamAssassin::TextCat::classify(
    $self, \$text, $main->{languages_filename}
  );
  undef $text;          # free that memory

  $self->{textcat_matches} = \@matches;

  # Expose the result as metadata so Bayes gets to take a look.
  my $matches_str = join(' ', @matches);
  $self->{msg}->put_metadata("X-Languages", $matches_str);
  dbg("metadata: X-Languages: $matches_str");
}
# ---------------------------------------------------------------------------
#sub dbg { Mail::SpamAssassin::dbg(@_); }
1;
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)