这里会显示出您选择的修订版和当前版本之间的差别。
— |
python-files:html-get-id [2011/12/23 05:03] (当前版本) |
||
---|---|---|---|
行 1: | 行 1: | ||
+ | ====== 得到HTML指定ID的内容 ====== | ||
+ | |||
+ | 很多时候,我们需要从一个HTML文件中提取指定的内容,比如指定ID的元素内容。我写了个脚本实现了这个功能,在这里和大家分享一下。 | ||
+ | |||
+ | |||
+ | |||
+ | ===== Python脚本 ===== | ||
+ | <code python> | ||
+ | ## | ||
+ | # created by gudonghua#gmail.com | ||
+ | # posted in http://www.pythonclub.org @ 2011-12-23 | ||
+ | # | ||
+ | ## | ||
+ | |||
+ | import re | ||
+ | import os | ||
+ | |||
# Lower-case prefixes of tags that never have a matching closing tag and
# therefore must not take part in open/close nesting: HTML void elements,
# comments / doctype declarations ("<!") and processing instructions ("<?").
# Entries are matched with ``str.startswith``; "col" is listed in exact
# forms so that the non-void "<colgroup>" is not skipped by accident.
IGNORE_TAGS_START = [
    "<img", "<br", "<!",
    "<?",
    "<area", "<base", "<col>", "<col ", "<embed", "<hr", "<input",
    "<link", "<meta", "<param", "<source", "<track", "<wbr",
]

# Suffixes that mark a tag as self-closing (XHTML style ``<tag ... />``).
IGNORE_TAGS_END = [
    "/>",
]
+ | |||
+ | |||
def get_id_tag(content, id_name):
    """Return the first opening tag in *content* whose ``id`` is *id_name*.

    The id may be double-quoted, single-quoted or unquoted, and whitespace
    around ``=`` is tolerated.  Matching is case-insensitive, as before.

    Args:
        content: HTML text to search.
        id_name: Value of the ``id`` attribute; surrounding whitespace is
            stripped before matching.

    Returns:
        The complete tag text (from ``<`` to ``>``) of the first match, or
        an empty list when nothing matches.  The empty list is the
        historical "not found" value; callers should test truthiness.
    """
    id_name = id_name.strip()
    if not id_name:
        # An empty name would otherwise match any tag that carries an id.
        return []

    # Escape the name so characters such as "." or "+" are taken literally.
    name = re.escape(id_name)
    # Either a quoted value that matches exactly, or an unquoted value that
    # is terminated by whitespace or the end of the tag.
    value = r"""(?:"%s"|'%s'|%s(?=\s|/?>))""" % (name, name, name)
    # ``\s`` before "id" keeps attributes like "data-id" from matching.
    pattern = r"<[^>]*\sid\s*=\s*" + value + r"[^>]*>"

    match = re.search(pattern, content, re.IGNORECASE)
    if match is None:
        return []
    return match.group(0)
+ | |||
def find_all_tags(content):
    """Return every ``<...>`` token found in *content*, in document order.

    A token runs from a ``<`` up to the next ``>``; opening tags, closing
    tags, comments and declarations are all returned as plain strings.
    """
    return re.findall("<[^>]*>", content)
+ | |||
def _is_standalone_tag(tag):
    """Return True if *tag* never has a matching closing tag.

    A tag is standalone when it starts with one of ``IGNORE_TAGS_START`` or
    ends with one of ``IGNORE_TAGS_END``; the comparison ignores case so
    that ``<BR>`` is treated like ``<br>``.
    """
    lowered = tag.lower()
    prefixes = tuple(p.lower() for p in IGNORE_TAGS_START)
    suffixes = tuple(s.lower() for s in IGNORE_TAGS_END)
    return lowered.startswith(prefixes) or lowered.endswith(suffixes)


def get_html_id(content, id_name):
    """Return the HTML of the element whose ``id`` attribute is *id_name*.

    The element is located with ``get_id_tag``; the tags that follow it are
    then counted (open: +1, close: -1, standalone tags ignored) until the
    nesting returns to zero.

    Args:
        content: HTML text to search.
        id_name: Value of the ``id`` attribute to look for.

    Returns:
        The substring of *content* from the element's opening tag through
        its closing tag, inclusive.  If the id tag is itself standalone
        (for example ``<img id="x"/>``) only that tag is returned.  An empty
        string is returned when the id is not found or the element is never
        closed.
    """
    id_tag = get_id_tag(content, id_name)
    if not id_tag:
        return ""

    cursor = 0        # end offset of the most recently visited tag
    start_index = -1  # offset of the id tag once it has been seen
    depth = 0         # open elements since the id tag (0 = not inside yet)

    for tag in find_all_tags(content):
        # Tags come back in document order, so searching from the end of
        # the previous tag yields this tag's exact position.
        tag_start = content.find(tag, cursor)
        cursor = tag_start + len(tag)

        if depth == 0:
            if tag != id_tag:
                continue
            if _is_standalone_tag(tag):
                # A void / self-closing element has no content to collect.
                return tag
            start_index = tag_start
            depth = 1
        elif not _is_standalone_tag(tag):
            if tag.startswith("</"):
                depth -= 1
            else:
                depth += 1
            if depth == 0:
                return content[start_index:cursor]

    # The element was never closed.
    return ""
+ | |||
if __name__ == "__main__":
    # Demo: print the element with id "bodytext" from a sample file.
    # The ``with`` block closes the file handle even if reading fails.
    with open("ft2.htm") as html_file:
        content = html_file.read()
    # Parenthesised single-argument form prints the same text on
    # Python 2 and Python 3.
    print(get_html_id(content, "bodytext"))
+ | </code> |