news4 - RSS aggregation system
Revision | 108d48fa43fc49064b94b1d5ed7236fa89bd9781 (tree) |
---|---|
Time | 2012-10-03 05:24:10 |
Author | hylom <hylom@hylo...> |
Committer | hylom |
implement some filters
@@ -38,19 +38,30 @@ class FeedFetcher(object): | ||
38 | 38 | return None |
39 | 39 | return entry |
40 | 40 | |
41 | + def _apply_filters(self, filters, entries): | |
42 | + for f in filters: | |
43 | + entry_filter = self._get_filter(f) | |
44 | + entries = [entry_filter(x) for x in entries] | |
45 | + # remove entry which is None | |
46 | + entries = [x for x in entries if x] | |
47 | + return entries | |
48 | + | |
49 | + def _apply_pre_filters(self, entries): | |
50 | + return self._apply_filters(config['pre_filters'], entries) | |
51 | + | |
52 | + def _apply_post_filters(self, entries): | |
53 | + return self._apply_filters(config['post_filters'], entries) | |
54 | + | |
41 | 55 | def get_entries(self): |
42 | 56 | 'get entries' |
43 | 57 | entries = self._fetch() |
44 | - entries = [self._embeded_filter(x) for x in entries] | |
45 | - entries = [x for x in entries if x] | |
58 | + entries = self._apply_pre_filters(entries) | |
46 | 59 | |
47 | 60 | if 'filter' in self._feed: |
48 | 61 | filters = self._feed.get('filter', None) |
49 | - for filter in filters: | |
50 | - entry_filter = self._get_filter(filter) | |
51 | - entries = [entry_filter(x) for x in entries] | |
52 | - # remove entry which is None | |
53 | - entries = [x for x in entries if x] | |
62 | + entries = self._apply_filters(filters, entries) | |
63 | + | |
64 | + entries = self._apply_post_filters(entries) | |
54 | 65 | return entries |
55 | 66 | |
56 | 67 | def _get_filter(self, filter_name): |
@@ -65,8 +76,16 @@ class FeedFetcher(object): | ||
65 | 76 | globals(), |
66 | 77 | locals(), |
67 | 78 | [filter_name,]) |
68 | - mod = mods.__getattribute__(filter_name) | |
79 | + try: | |
80 | + mod = mods.__getattribute__(filter_name) | |
81 | + except AttributeError: | |
82 | + raise FilterError(filter_name) | |
69 | 83 | |
70 | 84 | # return module's entry_filter function |
71 | 85 | return mod.entry_filter |
72 | 86 | |
87 | +class FilterError(Exception): | |
88 | + def __init__(self, value): | |
89 | + self.value = value | |
90 | + def __str__(self): | |
91 | + return 'filter "' + self.value + '" is not found.' |
@@ -0,0 +1,27 @@ | ||
1 | +# filter for Image extraction | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
6 | +re_blank = re.compile(r'<\s*(\w+)[^>]*>\s*</\s*\1\s*>') | |
7 | +re_br = re.compile(r'<\s*br\s*/?>') | |
8 | + | |
9 | +def _replace_all(rex, text): | |
10 | + m = rex.search(text) | |
11 | + while(m): | |
12 | + text = rex.sub('', text) | |
13 | + m = rex.search(text) | |
14 | + return text | |
15 | + | |
16 | +def entry_filter(entry): | |
17 | + body = entry['body'] | |
18 | + | |
19 | + # 空のタグを削除 | |
20 | + body = _replace_all(re_blank, body) | |
21 | + | |
22 | + # brタグを削除 | |
23 | + body = _replace_all(re_br, body) | |
24 | + | |
25 | + entry['body'] = body | |
26 | + return entry | |
27 | + |
@@ -0,0 +1,30 @@ | ||
1 | +# filter for slashdot.jp | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
6 | +re_read_all = re.compile(ur'''<p>\s*<a href=['"][^'"]+['"]>\s*すべて読む\s*</a>.*?</p>''') | |
7 | +re_related = re.compile(ur'''<p>\s*関連ストーリー:.*?</p>''') | |
8 | +re_topics = re.compile(ur'''<a href="http://slashdot.jp/stories/\w+">(.*?)</a>''') | |
9 | + | |
10 | +def entry_filter(entry): | |
11 | + # すべて読む、関連ストーリーを削除 | |
12 | + body = entry['body'] | |
13 | + topics = [] | |
14 | + m = re_read_all.search(body) | |
15 | + if m: | |
16 | + s = m.group(0) | |
17 | + itr = re_topics.findall(s) | |
18 | + for items in itr: | |
19 | + topics.append(items) | |
20 | + | |
21 | + body = re_read_all.sub('', body) | |
22 | + body = re_related.sub('', body) | |
23 | + if 'tags' in entry: | |
24 | + entry['tags'].extend(topics) | |
25 | + else: | |
26 | + entry['tags'] = topics | |
27 | + entry['body'] = body | |
28 | + | |
29 | + return entry | |
30 | + |
@@ -0,0 +1,12 @@ | ||
1 | +# PR filter | |
2 | +# -*- coding: utf-8 -*- | |
3 | + | |
4 | +import re | |
5 | + | |
6 | +def entry_filter(entry): | |
7 | + if re.search(u'^(PR|AD)(:|:)', entry['title']): | |
8 | + print 'delete PR entry - %s' % entry['title'] | |
9 | + return None | |
10 | + return entry | |
11 | + | |
12 | + |
@@ -64,13 +64,13 @@ | ||
64 | 64 | <span>タグ:</span> |
65 | 65 | % for tag in entry.tags: |
66 | 66 | <span>${tag} </span> |
67 | + % endfor | |
67 | 68 | % if 'images' in entry: |
68 | 69 | <span>画像:</span> |
69 | 70 | % for imgurl in entry.images: |
70 | 71 | <span><a href="${imgurl}">*</a></span> |
71 | 72 | % endfor |
72 | 73 | % endif |
73 | - % endfor | |
74 | 74 | </div> |
75 | 75 | </div> |
76 | 76 | </div> |