sfjplib for python
Revision | 2c4d4f2583be270bfc4e4652264f99ec50fb2202 (tree) |
---|---|
Time | 2011-07-28 20:00:14 |
Author | Hiromichi MATSUSHIMA <hirom@offi...> |
Committer | Hiromichi MATSUSHIMA |
rewrite form_retriver with htmltree
@@ -4,126 +4,83 @@ | ||
4 | 4 | |
5 | 5 | import HTMLParser |
6 | 6 | import re |
7 | +import htmltree | |
7 | 8 | |
8 | -class Form(dict): | |
9 | - def __init__(self, action, method): | |
10 | - self.action = action | |
11 | - self.method = method | |
12 | - | |
13 | -class FormItem(object): | |
14 | - def __init__(self, name, attrs, value): | |
15 | - self.name = name | |
16 | - self.attrs = attrs | |
17 | - self.value = value | |
18 | - | |
19 | -class FormRetriver(HTMLParser.HTMLParser): | |
20 | - UNCLOSE = ("input") | |
9 | +class Form(list): | |
10 | + def __init__(self): | |
11 | + self.elements = [] | |
12 | + self.action = None | |
13 | + self.target = None | |
14 | + self.enctype = None | |
15 | + self.method = None | |
21 | 16 | |
17 | +class FormRetriver(object): | |
22 | 18 | def __init__(self): |
23 | 19 | "Constructor" |
24 | - HTMLParser.HTMLParser.__init__(self) | |
25 | - self._current_form = None | |
26 | - self._stack = "" | |
27 | - self._capture = False | |
28 | - self._current_element = None | |
29 | - self.forms = [] | |
20 | + self._forms = [] | |
30 | 21 | |
31 | 22 | def parse(self, data): |
32 | - self.feed(data) | |
33 | - | |
34 | - # Handlers | |
35 | - def handle_starttag(self, tag, attrs): | |
36 | - if tag == "form": | |
37 | - attr = dict(attrs) | |
38 | - action = attr.get("action", "") | |
39 | - method = attr.get("method", "") | |
40 | - f = Form(action, method) | |
41 | - self._current_form = f | |
42 | - elif tag == "input": | |
43 | - self.handle_startendtag(tag, attrs) | |
44 | - elif tag == "textarea": | |
45 | - self._stack = "" | |
46 | - self._capture = True | |
47 | - e = FormItem(tag, attrs, "") | |
48 | - self._current_element = e | |
49 | - elif tag == "option": | |
50 | - attr = dict(attrs) | |
51 | - val = attr.get("value", "") | |
52 | - e = FormItem(tag, attrs, val) | |
53 | - if "selected" in attr: | |
54 | - self._current_element.value = val | |
55 | - if "SELECTED" in attr: | |
56 | - self._current_element.value = val | |
57 | - elif tag == "select": | |
58 | - e = FormItem(tag, attrs, "") | |
59 | - self._current_element = e | |
60 | - | |
61 | - def handle_endtag(self, tag): | |
62 | - if tag == "textarea": | |
63 | - text = self._stack | |
64 | - self._stack = "" | |
65 | - self._capture = False | |
66 | - e = self._current_element | |
67 | - attr = dict(e.attrs) | |
68 | - if "NAME" in attr: | |
69 | - name = attr["NAME"] | |
70 | - elif "name" in attr: | |
71 | - name = attr["name"] | |
72 | - else: | |
73 | - name = "" | |
74 | - if name: | |
75 | - e.value = text | |
76 | - self._current_form[name] = e | |
77 | - elif tag == "form": | |
78 | - self.forms.append(self._current_form) | |
79 | - elif tag == "select": | |
80 | - e = self._current_element | |
81 | - attr = dict(e.attrs) | |
82 | - if "NAME" in attr: | |
83 | - name = attr["NAME"] | |
84 | - elif "name" in attr: | |
85 | - name = attr["name"] | |
86 | - else: | |
87 | - name = "" | |
88 | - if name: | |
89 | - self._current_form[name] = e | |
90 | - | |
91 | - def handle_data(self, data): | |
92 | - if self._capture: | |
93 | - self._stack = self._stack + data | |
23 | + tree = htmltree.parse(data) | |
24 | + r = tree.root() | |
25 | + forms = r.get_elements_by_name("form") | |
26 | + for f in forms: | |
27 | + self._forms.append(self.form_parse(f)) | |
94 | 28 | |
95 | - def handle_startendtag(self, tag, attrs): | |
96 | - if tag == "input": | |
97 | - attr = dict(attrs) | |
98 | - e = FormItem(tag, attrs, "") | |
99 | - if "NAME" in attr: | |
100 | - name = attr["NAME"] | |
101 | - elif "name" in attr: | |
102 | - name = attr["name"] | |
103 | - else: | |
104 | - name = "" | |
105 | - if "VALUE" in attr: | |
106 | - e.value = attr["VALUE"] | |
107 | - if "value" in attr: | |
108 | - e.value = attr["value"] | |
109 | - | |
110 | - if name: | |
111 | - self._current_form[name] = e | |
29 | + def convert_ref(self, text): | |
30 | + f = lambda x: self._convert_ref(x) | |
31 | + return re.sub(r"&(\w+);", f, text) | |
112 | 32 | |
113 | - def handle_entityref(self, name): | |
114 | - if self._capture: | |
115 | - self._stack = self._stack + self.convert_ref(name) | |
116 | - | |
117 | - def convert_ref(self, name): | |
33 | + def _convert_ref(self, m): | |
118 | 34 | conv_dict = dict( |
119 | 35 | lt="<", |
120 | 36 | gt=">", |
121 | 37 | amp="&", |
122 | - quot="'" | |
38 | + quot="'", | |
39 | + nbsp=" ", | |
123 | 40 | ) |
124 | - return conv_dict[name] | |
41 | + return conv_dict[m.group(1)] | |
42 | + | |
43 | + def form_parse(self, elem): | |
44 | + f = Form() | |
45 | + f.action = elem.attr("action") | |
46 | + f.target = elem.attr("target") | |
47 | + f.enctype = elem.attr("enctype") | |
48 | + f.method = elem.attr("method") | |
49 | + self._r_form_parse(elem, f) | |
50 | + return f | |
51 | + | |
52 | + def _r_form_parse(self, elem, f): | |
53 | + for e in elem: | |
54 | + if e.name == "input": | |
55 | + if e.attr("name") == None: | |
56 | + continue | |
57 | + f.append((e.attr("name"), e.attr("value"))) | |
58 | + f.elements.append(e) | |
59 | + elif e.name == "textarea": | |
60 | + if e.attr("name") == None: | |
61 | + continue | |
62 | + t = e.inner_html().encode("utf-8") | |
63 | + t = self.convert_ref(t) | |
64 | + f.append((e.attr("name"), t)) | |
65 | + f.elements.append(e) | |
66 | + elif e.name == "select": | |
67 | + if e.attr("name") == None: | |
68 | + continue | |
69 | + name = e.attr("name") | |
70 | + for opt in e: | |
71 | + if opt.has_attribute("selected"): | |
72 | + f.append((name, opt.attr("value"))) | |
73 | + f.elements.append(opt) | |
74 | + elif e.name == "button": | |
75 | + if e.attr("name") == None: | |
76 | + continue | |
77 | + f.append((e.attr("name"), e.attr("value"))) | |
78 | + f.elements.append(e) | |
79 | + else: | |
80 | + self._r_form_parse(e, f) | |
81 | + | |
125 | 82 | |
126 | - def handle_charref(self, ref): | |
127 | - if self._capture: | |
128 | - self._stack = self._stack + "&#" + ref + ";" | |
83 | + def forms(self): | |
84 | + return self._forms | |
129 | 85 | |
86 | + |
@@ -11,10 +11,10 @@ if __name__ == "__main__": | ||
11 | 11 | |
12 | 12 | f = form_retriver.FormRetriver() |
13 | 13 | f.parse(html) |
14 | - for form in f.forms: | |
15 | - print form.action, form.method, ":" | |
16 | - for name in form: | |
17 | - print name, ":", form[name].value | |
14 | + for form in f.forms(): | |
15 | + print "\n", form.action, ":" | |
16 | + for (key, val) in form: | |
17 | + print key, val | |
18 | 18 | |
19 | 19 | |
20 | 20 |