得到HTML指定ID的内容

很多时候，我们需要从一个HTML文件中提取指定的内容，例如某个指定ID的元素内容。为此我写了一个脚本来实现这个功能，在这里和大家分享一下。

Python脚本

##
# created by gudonghua#gmail.com
# posted in http://www.pythonclub.org @ 2011-12-23
# 
##
 
import re
import os
 
# Tags beginning with one of these prefixes are skipped by get_html_id when
# it balances opening and closing tags.
IGNORE_TAGS_START = ["<img", "<br", "<!"]

# Tags ending with one of these suffixes (self-closing form) are skipped by
# get_html_id in the same way.
IGNORE_TAGS_END = ["/>"]
 
 
def get_id_tag(content, id_name):
  """Return the first tag in *content* whose ``id`` attribute is *id_name*.

  The id value may be single-quoted, double-quoted or unquoted, and
  whitespace around ``=`` is accepted. Matching is case-insensitive.

  Args:
    content: HTML text to search.
    id_name: id value to look for; surrounding whitespace is stripped.

  Returns:
    The complete tag text from ``<`` to ``>``, or an empty list when no tag
    matches (kept for backward compatibility; callers test truthiness).
  """
  id_name = id_name.strip()
  patt_id_tag = (
      # The lookbehind stops attributes such as ``data-id`` or ``myid`` from
      # being mistaken for ``id``.
      r"""<[^>]*(?<![\w-])id\s*=\s*['"]?"""
      # Escape the name so characters like "." or "+" are matched literally.
      + re.escape(id_name)
      # The value must end here: closing quote, whitespace, "/" or ">".
      # This also accepts an unquoted id directly followed by ">".
      + r"""(?=['"\s/>])[^>]*>"""
  )
  match = re.search(patt_id_tag, content, re.IGNORECASE)
  if match is None:
    return []
  return match.group(0)
 
def find_all_tags(content):
  """Return every ``<...>`` span found in *content*, in document order.

  A span runs from a ``<`` to the next ``>``, so opening tags, closing tags,
  self-closing tags and ``<!...>`` constructs are all included.
  """
  return [found.group(0) for found in re.finditer("<[^>]*>", content)]
 
def get_html_id(content, id_name):
  """Return the HTML of the element whose ``id`` attribute is *id_name*.

  The element's opening tag is located with get_id_tag. Tags after it are
  then counted: each opening tag increases the nesting depth and each
  closing tag decreases it, until the depth returns to zero. Tags matching
  IGNORE_TAGS_START (compared case-insensitively) or IGNORE_TAGS_END have
  no closing partner and are not counted.

  Args:
    content: HTML text to search.
    id_name: id value of the wanted element.

  Returns:
    The text from the start of the element's opening tag through the end of
    its matching closing tag. If the opening tag is itself one of the
    ignored kinds (e.g. ``<img id=...>``), just that tag is returned.
    Returns "" when no tag carries the id or the tags never balance.
  """
  id_tag = get_id_tag(content, id_name)
  if not id_tag:
    return ""

  ignore_start = tuple(prefix.lower() for prefix in IGNORE_TAGS_START)
  ignore_end = tuple(IGNORE_TAGS_END)

  def has_no_closing_tag(tag):
    return tag.lower().startswith(ignore_start) or tag.endswith(ignore_end)

  # An element that cannot have children consists of its own tag only.
  if has_no_closing_tag(id_tag):
    return id_tag

  start_index = -1
  depth = 0
  # Use the match offsets directly; searching for the tag text again could
  # land on an identical string inside a comment.
  for match in re.finditer(r"<[^>]*>", content):
    tag = match.group(0)
    if start_index < 0:
      if tag == id_tag:
        start_index = match.start()
        depth = 1
      continue
    if has_no_closing_tag(tag):
      continue
    if tag.startswith("</"):
      depth -= 1
    else:
      depth += 1
    if depth == 0:
      return content[start_index:match.end()]
  return ""
 
if __name__ == "__main__":
  # Demo: print the element with id "bodytext" from the sample file.
  # The with-block closes the file handle once it has been read.
  with open("ft2.htm") as html_file:
    content = html_file.read()
  # Call form of print with a single argument works on Python 2 and 3.
  print(get_html_id(content, "bodytext"))
python-files/html-get-id.txt · 最后更改: 2011/12/23 13:03 由 admin
2007~2011 Copyright @ http://www.pythonclub.org