====== 百度空间相册下载器 python实现 ======

by Gods_巨蚁（zz）

最近在学习 Python，感觉 Python 确实挺好用。

昨晚加上今天，实现了一个百度空间相册下载器。

下面公开源代码。作者：Gods_巨蚁，转载请注明出处。
===== 脚本 =====

<code python>
#coding: UTF-8

import urllib, re, os

__metaclass__ = type	# Python 2 idiom: makes the classes defined in this module new-style

class AntAlbumDownload:
	'''
	Download the photos of a Baidu Space (hi.baidu.com) album.

	Typical use: call setAttr() to choose the target directory and the album
	URL, then download() to fetch the pictures.  Album index pages and
	per-photo pages are scraped with the regular expressions defined below.

	The class targets Python 2: it relies on urllib.urlopen,
	urllib.urlretrieve, raw_input and xrange.

	Attributes set by setAttr():
	  nameAlbum          directory the pictures are saved into
	  urlAlbum           URL of the first index page (page 0) of the album
	  countAnalysisPage  number of photo pages analysed so far
	'''
	# Sample of the JavaScript record an album index page holds for each
	# photo; patPage is written against this shape:
	#imgarr[len]={purl:"/zhongji/album/item/8177718de7d67312b21bba72.html", psrc:"http://hiphotos.baidu.com/zhongji/abpic/item/8177718de7d67312b21bba72.jpg", 
	#		psize:"300*200 61K", pcmtNum:0, pname:"移动.gif",
	#		pedit:  '' ,
	#		pid:"8177718de7d67312b21bba72",
	#isMobileUp:0,
	#isLocked:0				};
	
	# Extracts (purl, psrc, pname, pid) from every photo record of an album
	# index page.
	patPage = re.compile(
		r'''
		imgarr\[len\]={purl:"(.*?)",
		.*?
		psrc:"(.*?)",
		.*?
		pname:"(.*?)"
		.*?
		pid:"(.*?)"
		.*?};
		''',
		re.VERBOSE
	)
	# Looser pattern for manually checking that a page looks like an album
	# index page; not referenced by the class itself.
	pat = re.compile(r'imgarr\[len\]={purl:".*?",.*?psrc:".*?",.*?')
	
	# Sample of the "Session" JavaScript object found in a photo page;
	# patImage is written against this shape:
	#var Session = {
	#spaceURL: "/zhongji",
	#isHost: false,         // whether the visitor is the space owner
	#isLogin: false,
	#isActive: false,
	#isShowVcode: true,    
	#userName: "饥饿蚂蚁",   // user name of the space owner
	#userNameEnc:    "%BC%A2%B6%F6%C2%EC%D2%CF", 
	#visitorName:    "",
	#visitorURL: "\/index.html",        // 
	#refer: "http:\/\/hi.baidu.com\/zhongji\/album\/%D7%CA%C1%CF%D6%D0%B5%C4%CD%BC%C6%AC\/index\/2",
	#spaceDomain: 'http://hi.baidu.com',
	#spaceStaticDomain: 'http://hi.bdimg.com',
	#portraitDomain: 'http://tx.bdimg.com',
	#photoDomain: 'http://hiphotos.baidu.com',
	#hiupDomain: 'http://hiup.baidu.com',
	#spToken: 'd3981061a624c51023d46bcdc8336fd4'
	#};
	
	# Extracts (spaceURL, userName, photoDomain) from the Session object of a
	# photo page.
	patImage = re.compile(
		r'''
		var\ Session\ =\ {
		.*?
		spaceURL:\ "(.*?)",
		.*?
		userName:\ "(.*?)",	# 空间主人用户名
		.*?
		photoDomain:\ '(.*?)',
		.*?};
		''',
		re.VERBOSE
	)
	# Looser pattern for manually checking that a page looks like a photo
	# page; not referenced by the class itself.
	pat2 = re.compile(r'''
		var\ Session\ =\ {
		.*?
		spaceURL.*?
		userName:.*?
		photoDomain:
		''',
		re.VERBOSE
	)

	# Splits an album URL into the part before "/index/<page>".
	# Compiled once here instead of on every _getIndexPageUrl() call.
	_patIndexPage = re.compile(r'(.*?)/index/[0-9]+')

	# Finds the "[last page]" link of an album index page, e.g.
	#<a  href="#/zhongji/album/%D7%CA%C1%CF%D6%D0%B5%C4%CD%BC%C6%AC/index/3">[尾页]</a>
	# and captures the index of the last page.
	# NOTE(review): the page text is matched as raw, undecoded bytes, so this
	# literal only matches when the server uses the same encoding as this
	# source file (UTF-8) -- confirm against a live page.
	_patLastPage = re.compile(
		r'''
		<a.+?href="\#.+?/index/([0-9]+?)">\[尾页\]</a>
		''',
		re.VERBOSE
	)

	def __init__(self):
		'''Create an unconfigured downloader; call setAttr() before download().'''
		pass
		
	def _getPageText(self, url):
		'''
		Fetch url and return its body with every line break replaced by a space.

		The patterns above are compiled without re.DOTALL, so '.' would stop at
		a newline; flattening the page lets them match across lines.
		'''
		page = urllib.urlopen(url)
		try:
			text = page.read()
		finally:
			# Release the connection even if reading fails.
			page.close()
		return text.replace('\r\n', ' ').replace('\n', ' ')

	def setAttr(self, name = '', url = ''):
		'''
		Configure the download and create the target directory.

		name -- directory to save pictures into; prompted for when empty
		url  -- URL of the album; prompted for when empty

		The URL is normalised to the first index page of the album and stored
		in self.urlAlbum.
		'''
		self.nameAlbum = name or raw_input('I will create the album directory, Input the name:')
		self.urlAlbum = url or raw_input('Input the URL of the album:')
		self.countAnalysisPage = 0
		
		# Baidu Space specific: when the URL has a '#', the part after it is
		# the real path of the album and replaces the current URL.
		loc = self.urlAlbum.find('#')
		if loc != -1:
			self.urlAlbum = 'http://hi.baidu.com' + self.urlAlbum[loc+1:]
		
		# Point at page 0 whether or not the URL already carried a page number.
		self.urlAlbum = self._getIndexPageUrl(0)

		# Same output as the original two-item print statement (label, space, URL).
		print('解析到相册首页URL为: ' + ' ' + str(self.urlAlbum))
		
		# Create the directory directly instead of shelling out to "md": this
		# works on every platform, copes with spaces in the name and never
		# hands user input to a shell.
		if not os.path.isdir(self.nameAlbum):
			os.makedirs(self.nameAlbum)
	
	def analysisImagePage(self, url, imageId):
		'''
		Analyse one photo page and return the URL of the full-size picture.

		url     -- URL of the photo page
		imageId -- id of the photo (the "pid" field of the album record)

		Raises ValueError when the page does not contain the expected Session
		object.
		'''
		text = self._getPageText(url)
		
		print('分析图片页 当前页:%3d' % self.countAnalysisPage)
		self.countAnalysisPage += 1
		
		return self._buildImageUrl(text, imageId, url)

	def _buildImageUrl(self, text, imageId, url = ''):
		'''Build the full-size picture URL from the flattened text of a photo page.'''
		session = self.patImage.search(text)
		if session is None:
			raise ValueError('no Session block found in photo page %s' % url)
		# <photoDomain><spaceURL>/pic/item/<imageId>.jpg
		return session.group(3) + session.group(1) + '/pic/item/' + imageId +'.jpg'
	
	def _getIndexPageUrl(self, iPage):
		'''Return the URL of index page number iPage of the current album.'''
		urlIndexPage = self._patIndexPage.search(self.urlAlbum)
		# Reuse the part before an existing "/index/<n>", otherwise the whole URL.
		base = urlIndexPage.group(1) if urlIndexPage else self.urlAlbum
		return base + '/index/' + str(iPage)
	
	def _getMaxPage(self, textPage):
		'''Return the index of the last album page, or 0 when no "last page" link is found.'''
		urlLastPage = self._patLastPage.search(textPage)
		if urlLastPage:
			print('尾页匹配成功')
			return int(urlLastPage.group(1))
		print('尾页匹配失败')
		return 0

	def _collectImageUrls(self, countPage, textPage):
		'''
		Return the full-size picture URLs of the first countPage index pages.

		textPage is the already fetched text of page 0; later pages are fetched
		here.  Photos whose page cannot be parsed are reported and skipped.
		'''
		images = []
		for iPage in xrange(countPage):
			print('分析相册 当前页:%3d' % (iPage))
			if iPage != 0:
				textPage = self._getPageText(self._getIndexPageUrl(iPage))
			
			for imagePage, imageSmall, imageName, imageId in self.patPage.findall(textPage):
				# URL of the page that shows the original picture.
				urlImagePage = 'http://hi.baidu.com' + imagePage
				try:
					images.append(self.analysisImagePage(urlImagePage, imageId))
				except ValueError as err:
					# One unreadable photo page should not abort the whole album.
					print('Skipping %s: %s' % (urlImagePage, err))
		return images

	def _downloadImages(self, images):
		'''Save every URL in images as NNNN.jpg inside self.nameAlbum, printing progress.'''
		total = len(images)
		for index, image in enumerate(images):
			pathImage = os.path.join(self.nameAlbum, '%04d.jpg' % index)
			urllib.urlretrieve(image, pathImage)
			
			print(pathImage)
			print('下载完成%.1f:%%' % ((index+1)*100.0/total))

	def analysis(self, countPage):
		'''
		Analyse the album index pages and download the pictures they list.

		countPage -- number of index pages to process; 0 means all pages.
		             Values above the real page count are clamped to it.
		'''
		print('开始分析页面')
		
		# Text of the first index page; it also carries the "last page" link.
		textPage = self._getPageText(self.urlAlbum)
		
		# Page indices are 0-based, so the album has maxPage + 1 pages.
		maxPage = self._getMaxPage(textPage)
		if countPage == 0 or countPage > maxPage + 1:
			countPage = maxPage + 1
		
		images = self._collectImageUrls(countPage, textPage)
		
		print('分析完成，开始下载')
		
		self._downloadImages(images)
	
	def download(self, countPage = 0):
		'''
		Download the pictures of the album.

		countPage -- number of index pages to download; 0 downloads everything.
		'''
		self.analysis(countPage)

def main():
	'''Interactively ask for a directory name and an album URL, then download the album.'''
	album = AntAlbumDownload()
	# Empty arguments make setAttr() prompt for both values on the console.
	album.setAttr('', '')
	# 0 means "all pages of the album".
	album.download(0)


# Start the downloader only when the file is executed as a script, so that
# importing it (e.g. to reuse AntAlbumDownload) has no side effects.
if __name__ == '__main__':
	main()
</code>

===== 参考  =====
  * http://www.cnblogs.com/gods/archive/2011/04/20/2022332.html