news4
Fork

(Original repository, No fork origin)

Commit

implement some filters

--- a/fetcher.py

+++ b/fetcher.py

		@@ -38,19 +38,30 @@ class FeedFetcher(object):
38	38	return None
39	39	return entry
40	40
	41	+ def _apply_filters(self, filters, entries):
	42	+ for f in filters:
	43	+ entry_filter = self._get_filter(f)
	44	+ entries = [entry_filter(x) for x in entries]
	45	+ # drop entries that the filter rejected (returned as None)
	46	+ entries = [x for x in entries if x]
	47	+ return entries
	48	+
	49	+ def _apply_pre_filters(self, entries):
	50	+ return self._apply_filters(config['pre_filters'], entries)
	51	+
	52	+ def _apply_post_filters(self, entries):
	53	+ return self._apply_filters(config['post_filters'], entries)
	54	+
41	55	def get_entries(self):
42	56	'get entries'
43	57	entries = self._fetch()
44		- entries = [self._embeded_filter(x) for x in entries]
45		- entries = [x for x in entries if x]
	58	+ entries = self._apply_pre_filters(entries)
46	59
47	60	if 'filter' in self._feed:
48	61	filters = self._feed.get('filter', None)
49		- for filter in filters:
50		- entry_filter = self._get_filter(filter)
51		- entries = [entry_filter(x) for x in entries]
52		- # remove entry which is None
53		- entries = [x for x in entries if x]
	62	+ entries = self._apply_filters(filters, entries)
	63	+
	64	+ entries = self._apply_post_filters(entries)
54	65	return entries
55	66
56	67	def _get_filter(self, filter_name):

		@@ -65,8 +76,16 @@ class FeedFetcher(object):
65	76	globals(),
66	77	locals(),
67	78	[filter_name,])
68		- mod = mods.__getattribute__(filter_name)
	79	+ try:
	80	+ mod = mods.__getattribute__(filter_name)
	81	+ except AttributeError:
	82	+ raise FilterError(filter_name)
69	83
70	84	# return module's entry_filter function
71	85	return mod.entry_filter
72	86
	87	+class FilterError(Exception):
	88	+ def __init__(self, value):
	89	+ self.value = value
	90	+ def __str__(self):
	91	+ return 'filter "' + self.value + '" is not found.'

--- /dev/null

+++ b/filters/cleanup.py

		@@ -0,0 +1,27 @@
	1	+# filter for cleaning up markup (removes empty tags and <br> tags)
	2	+# -*- coding: utf-8 -*-
	3	+
	4	+import re
	5	+
	6	+re_blank = re.compile(r'<\s*(\w+)[^>]*>\s*</\s*\1\s*>')
	7	+re_br = re.compile(r'<\s*br\s*/?>')
	8	+
	9	+def _replace_all(rex, text):
	10	+ m = rex.search(text)
	11	+ while(m):
	12	+ text = rex.sub('', text)
	13	+ m = rex.search(text)
	14	+ return text
	15	+
	16	+def entry_filter(entry):
	17	+ body = entry['body']
	18	+
	19	+ # 空のタグを削除
	20	+ body = _replace_all(re_blank, body)
	21	+
	22	+ # brタグを削除
	23	+ body = _replace_all(re_br, body)
	24	+
	25	+ entry['body'] = body
	26	+ return entry
	27	+

--- /dev/null

+++ b/filters/gizmodo.py

		@@ -0,0 +1,30 @@
	1	+# filter for slashdot.jp
	2	+# -*- coding: utf-8 -*-
	3	+
	4	+import re
	5	+
	6	+re_read_all = re.compile(ur'''<p>\s*<a href=['"][^'"]+['"]>\s*すべて読む\s*</a>.*?</p>''')
	7	+re_related = re.compile(ur'''<p>\s*関連ストーリー：.*?</p>''')
	8	+re_topics = re.compile(ur'''<a href="http://slashdot.jp/stories/\w+">(.*?)</a>''')
	9	+
	10	+def entry_filter(entry):
	11	+ # すべて読む、関連ストーリーを削除
	12	+ body = entry['body']
	13	+ topics = []
	14	+ m = re_read_all.search(body)
	15	+ if m:
	16	+ s = m.group(0)
	17	+ itr = re_topics.findall(s)
	18	+ for items in itr:
	19	+ topics.append(items)
	20	+
	21	+ body = re_read_all.sub('', body)
	22	+ body = re_related.sub('', body)
	23	+ if 'tags' in entry:
	24	+ entry['tags'].extend(topics)
	25	+ else:
	26	+ entry['tags'] = topics
	27	+ entry['body'] = body
	28	+
	29	+ return entry
	30	+

--- /dev/null

+++ b/filters/pr_block.py

		@@ -0,0 +1,12 @@
	1	+# PR filter
	2	+# -*- coding: utf-8 -*-
	3	+
	4	+import re
	5	+
	6	+def entry_filter(entry):
	7	+ if re.search(u'^(PR|AD)(:|：)', entry['title']):
	8	+ print 'delete PR entry - %s' % entry['title']
	9	+ return None
	10	+ return entry
	11	+
	12	+

--- a/templates/index.tmpl.html

+++ b/templates/index.tmpl.html