1 |
neurogeek 08/11/28 01:39:20 |
2 |
|
3 |
Added: reverend-0.3-email.patch |
4 |
Log: |
5 |
Initial commit. Thanks to David Guerizec for the ebuild. |
6 |
(Portage version: 2.2_rc12/cvs/Linux 2.6.18-gentoo-r3 i686) |
7 |
|
8 |
Revision Changes Path |
9 |
1.1 dev-python/reverend/files/reverend-0.3-email.patch |
10 |
|
11 |
file : http://sources.gentoo.org/viewcvs.py/gentoo-x86/dev-python/reverend/files/reverend-0.3-email.patch?rev=1.1&view=markup |
12 |
plain: http://sources.gentoo.org/viewcvs.py/gentoo-x86/dev-python/reverend/files/reverend-0.3-email.patch?rev=1.1&content-type=text/plain |
13 |
|
14 |
Index: reverend-0.3-email.patch |
15 |
=================================================================== |
16 |
--- reverend/guessers/email.py 2006-04-25 00:15:27.000000000 +0200 |
17 |
+++ reverend/guessers/email.py 2006-04-25 01:12:16.000000000 +0200 |
18 |
@@ -9,7 +9,6 @@ |
19 |
import email |
20 |
|
21 |
from reverend.thomas import Bayes |
22 |
-from reverend.splitter import Splitter |
23 |
|
24 |
|
25 |
class EmailClassifier(Bayes): |
26 |
@@ -19,19 +18,22 @@ |
27 |
# This should return a list of strings |
28 |
# which will be used as the key into |
29 |
# the table of token counts |
30 |
- tokens = self.getHeaderTokens(msg) |
31 |
- tokens += self.getBodyTokens(msg) |
32 |
- |
33 |
+ for tok in self.getHeaderTokens(msg): |
34 |
+ yield tok |
35 |
+ |
36 |
+ for tok in self.getBodyTokens(msg): |
37 |
+ yield tok |
38 |
+ |
39 |
# Get some tokens that are generated from the |
40 |
# header and the structure |
41 |
- tokens += self.getMetaTokens(msg) |
42 |
- return tokens |
43 |
+ for tok in self.getMetaTokens(msg): |
44 |
+ yield tok |
45 |
|
46 |
def getBodyTokens(self, msg): |
47 |
text = self.getTextPlain(msg) |
48 |
if text is None: |
49 |
text = '' |
50 |
- tl = self.splitter.split(text) |
51 |
+ tl = self._tokenizer.tokenize(text) |
52 |
return tl |
53 |
|
54 |
def getHeaderTokens(self, msg): |
55 |
@@ -40,12 +42,12 @@ |
56 |
text += msg.get('from','fromnoone') + ' ' |
57 |
text += msg.get('to','tonoone') + ' ' |
58 |
text += msg.get('cc','ccnoone') + ' ' |
59 |
- tl = self.splitter.split(text) |
60 |
+ tl = self._tokenizer.tokenize(text) |
61 |
return tl |
62 |
|
63 |
def getTextPlain(self, msg): |
64 |
for part in msg.walk(): |
65 |
- typ = part.get_type() |
66 |
+ typ = part.get_content_type() |
67 |
if typ and typ.lower() == "text/plain": |
68 |
text = part.get_payload(decode=True) |
69 |
return text |
70 |
@@ -53,7 +55,7 @@ |
71 |
|
72 |
def getTextHtml(self, msg): |
73 |
for part in msg.walk(): |
74 |
- typ = part.get_type() |
75 |
+ typ = part.get_content_type() |
76 |
if typ and typ.lower() == "text/html": |
77 |
text = part.get_payload(decode=False) |
78 |
return text |