python-files:html-get-id

差别

这里会显示出您选择的修订版和当前版本之间的差别。

到此差别页面的链接

@@ 行 1: / 行 1: @@
+====== 得到HTML指定ID的内容 ======
+很多时候，我们需要从一个HTML文件中提取指定的内容，比如某个指定ID的元素及其内容。我写了个脚本实现了这个功能，和大家分享一下。
+===== Python脚本 =====
+<code python>
+##
+# created by gudonghua#gmail.com
+# posted in http://www.pythonclub.org @ 2011-12-23
+#
+##
+import re
+import os
+IGNORE_TAGS_START = [
+    "<img", "<br", "<!"
+    ]
+IGNORE_TAGS_END = [
+    "/>"
+    ]
+def get_id_tag(content, id_name):
+  id_name = id_name.strip()
+  patt_id_tag = """<[^>]*id=['"]?""" + id_name + """['" ][^>]*>"""
+  id_tag = re.findall(patt_id_tag, content, re.DOTALL|re.IGNORECASE)
+  if id_tag:
+    id_tag = id_tag[0]
+  return id_tag
+def find_all_tags(content):
+  tag_patt = """<[^>]*>"""
+  tags = re.findall(tag_patt, content)
+  return tags
+def get_html_id(content, id_name):
+  tag_content = ""
+  all_tags = find_all_tags(content)
+  id_tag = get_id_tag(content, id_name)
+  print "id_tag", id_tag
+  tag_stack = []
+  if not id_tag:
+    return ""
+  in_tag = 0
+  id_content = ""
+  index = 0
+  for tag in all_tags:
+    if in_tag == 0 and tag == id_tag:
+      tag_stack.append(tag)
+      start_index = content.find(tag)
+      index = start_index + len(tag)
+      in_tag = 1
+      print "in_tag", tag
+    elif in_tag == 1:
+      print len(tag_stack), tag_stack[0:2]
+      ignore_flag = 0
+      for t in IGNORE_TAGS_START:
+        if tag.startswith(t):
+          ignore_flag = 1; break
+      for t in IGNORE_TAGS_END:
+        if tag.endswith(t):
+          ignore_flag = 1; break
+      if ignore_flag:
+        continue
+      if tag.startswith("</"):
+        tag_stack.pop()
+      else:
+        tag_stack.append(tag)
+      index = content.find(tag, index)
+      index += len(tag)
+      if not tag_stack:
+        id_content = content[start_index: index]
+        break
+  return id_content
+if __name__ == "__main__":
+  content = open("ft2.htm").read()
+  print get_html_id(content, "bodytext")
+</code>

python-files/html-get-id.txt · 最后更改: 2011/12/23 05:03 (外部编辑)