# -*- coding: utf-8 -*-
"""Baidu Space (hi.baidu.com) album downloader, implemented in Python.

By Gods_巨蚁 (zz).

Author's note: I have been learning Python lately and find it really pleasant
to use.  Over last night and today I wrote this Baidu Space album downloader.
The source is published below.  Author: Gods_巨蚁 -- please credit the source
when reposting.

Reference:
    http://www.cnblogs.com/gods/archive/2011/04/20/2022332.html

The script is written to run on both Python 2 and Python 3 (3.3+).
"""
import os
import re

try:  # Python 3
    from urllib.request import urlopen, urlretrieve
except ImportError:  # Python 2
    from urllib import urlopen, urlretrieve

try:  # Python 2
    _readLine = raw_input
except NameError:  # Python 3
    _readLine = input

__metaclass__ = type  # use new-style classes on Python 2


class AntAlbumDownload:
    """Download the photos of a Baidu Space album.

    Typical use: call ``setAttr`` to choose the target directory and the
    album URL, then ``download`` to fetch the pictures.
    """

    # Site root that relative album / photo page paths are appended to.
    _SPACE_DOMAIN = 'http://hi.baidu.com'

    # Encoding used to decode downloaded pages into text.
    # NOTE(review): assumed to be GBK because the sample ``userNameEnc``
    # value quoted below is a GBK percent-encoding -- confirm against a
    # real response.
    pageEncoding = 'gbk'

    # Sample of one entry as it appears in an album index page:
    # imgarr[len]={purl:"/zhongji/album/item/8177718de7d67312b21bba72.html",
    #     psrc:"http://hiphotos.baidu.com/zhongji/abpic/item/8177718de7d67312b21bba72.jpg",
    #     psize:"300*200 61K", pcmtNum:0, pname:"移动.gif",
    #     pedit: '' ,
    #     pid:"8177718de7d67312b21bba72",
    #     isMobileUp:0,
    #     isLocked:0 };
    # Matches one album entry; groups are (purl, psrc, pname, pid).
    patPage = re.compile(
        r'''
        imgarr\[len\]={purl:"(.*?)",
        .*?
        psrc:"(.*?)",
        .*?
        pname:"(.*?)"
        .*?
        pid:"(.*?)"
        .*?};
        ''', re.VERBOSE
    )

    # Looser pattern, only used to test whether a page is an album page.
    pat = re.compile(r'imgarr\[len\]={purl:".*?",.*?psrc:".*?",.*?')

    # Sample of the ``Session`` object embedded in a photo page:
    # var Session = {
    #     spaceURL: "/zhongji",
    #     isHost: false,      // whether the visitor owns the space
    #     isLogin: false,
    #     isActive: false,
    #     isShowVcode: true,
    #     userName: "饥饿蚂蚁",  // user name of the space owner
    #     userNameEnc: "%BC%A2%B6%F6%C2%EC%D2%CF",
    #     visitorName: "",
    #     visitorURL: "\/index.html", //
    #     refer: "http:\/\/hi.baidu.com\/zhongji\/album\/%D7%CA%C1%CF%D6%D0%B5%C4%CD%BC%C6%AC\/index\/2",
    #     spaceDomain: 'http://hi.baidu.com',
    #     spaceStaticDomain: 'http://hi.bdimg.com',
    #     portraitDomain: 'http://tx.bdimg.com',
    #     photoDomain: 'http://hiphotos.baidu.com',
    #     hiupDomain: 'http://hiup.baidu.com',
    #     spToken: 'd3981061a624c51023d46bcdc8336fd4'
    # };
    # Matches the Session object; groups are (spaceURL, userName, photoDomain).
    patImage = re.compile(
        r'''
        var\ Session\ =\ {
        .*?
        spaceURL:\ "(.*?)",
        .*?
        userName:\ "(.*?)",    # user name of the space owner
        .*?
        photoDomain:\ '(.*?)',
        .*?};
        ''', re.VERBOSE
    )

    # Looser pattern, only used to test whether a page is a photo page.
    pat2 = re.compile(
        r'''
        var\ Session\ =\ {
        .*?
        spaceURL.*?
        userName:.*?
        photoDomain:
        ''', re.VERBOSE
    )

    # Splits an album URL into "everything before /index/N" and N.
    _patIndexPage = re.compile(r'(.*?)/index/([0-9]+)')

    # Captures the page number targeted by the "[尾页]" (last page) link.
    # The previous pattern had no capture group, so ``group(1)`` raised
    # IndexError whenever the link was found.
    # NOTE(review): the markup around the link text is not shown anywhere in
    # this file; this assumes the link's href ends in ``/index/<n>"`` like the
    # other album URLs handled here -- confirm against a real page.
    patLastPage = re.compile(u'/index/([0-9]+)"[^>]*>\\s*\\[尾页\\]')

    def __init__(self):
        # Defaults so the object is usable for inspection before setAttr().
        self.nameAlbum = ''
        self.urlAlbum = ''
        self.countAnalysisPage = 0

    def _getPageText(self, url):
        """Return the page at ``url`` as text with line breaks turned into spaces.

        Line breaks are removed because the page patterns use ``.`` (which
        does not match a newline) to skip over multi-line JavaScript.
        """
        page = urlopen(url)
        try:
            raw = page.read()
        finally:
            # Release the connection even when read() fails.
            page.close()
        text = raw.decode(self.pageEncoding, 'replace')
        return text.replace('\r\n', ' ').replace('\n', ' ')

    def setAttr(self, name='', url=''):
        """Set the target directory and album URL, prompting for missing ones.

        name -- directory the pictures are saved into; created if missing.
        url  -- URL of the album; if it contains ``#`` only the part after it
                is used, and the URL is normalised to the first index page.

        Raises ValueError if the name or URL is still empty after prompting.
        """
        self.nameAlbum = name or _readLine(
            'I will create the album directory, Input the name:')
        self.urlAlbum = url or _readLine('Input the URL of the album:')
        if not self.nameAlbum:
            raise ValueError('album directory name must not be empty')
        if not self.urlAlbum:
            raise ValueError('album URL must not be empty')
        self.countAnalysisPage = 0

        # Baidu Space quirk: when the URL contains '#', the real album path
        # is the fragment that follows it.
        loc = self.urlAlbum.find('#')
        if loc != -1:
            self.urlAlbum = self._SPACE_DOMAIN + self.urlAlbum[loc + 1:]

        # Always start from page 0, whether or not a page number was given.
        self.urlAlbum = self._getIndexPageUrl(0)
        print('解析到相册首页URL为: ' + ' ' + self.urlAlbum)

        # Create the directory without going through a shell: portable, and
        # the user-supplied name is never interpreted as a command.
        if not os.path.isdir(self.nameAlbum):
            os.makedirs(self.nameAlbum)

    def analysisImagePage(self, url, imageId):
        """Fetch a photo page and return the URL of the full-size picture.

        Raises ValueError if the page does not contain the Session object.
        """
        text = self._getPageText(url)
        print('分析图片页 当前页:%3d' % self.countAnalysisPage)
        self.countAnalysisPage += 1
        session = self.patImage.search(text)
        if session is None:
            raise ValueError('Session block not found in photo page: %s' % url)
        # photoDomain + spaceURL + '/pic/item/' + id + '.jpg'
        return (session.group(3) + session.group(1)
                + '/pic/item/' + imageId + '.jpg')

    def _getIndexPageUrl(self, iPage):
        """Return the URL of index page ``iPage`` of the current album."""
        found = self._patIndexPage.search(self.urlAlbum)
        base = found.group(1) if found else self.urlAlbum
        return base + '/index/' + str(iPage)

    def _getMaxPage(self, textPage):
        """Return the highest page index linked from ``textPage`` (0 if none)."""
        found = self.patLastPage.search(textPage)
        if found:
            print('尾页匹配成功')
            return int(found.group(1))
        print('尾页匹配失败')
        return 0

    def _collectImageUrls(self, countPage):
        """Walk the album index pages and return the full-size picture URLs."""
        textPage = self._getPageText(self.urlAlbum)
        maxPage = self._getMaxPage(textPage)

        # 0 means "all pages"; never go past the last page.
        if countPage == 0 or countPage > maxPage + 1:
            countPage = maxPage + 1

        images = []
        for iPage in range(countPage):
            print('分析相册 当前页:%3d' % iPage)
            if iPage != 0:
                # Page 0 was already fetched above.
                textPage = self._getPageText(self._getIndexPageUrl(iPage))
            for imagePage, _small, _name, imageId in self.patPage.findall(textPage):
                urlImagePage = self._SPACE_DOMAIN + imagePage
                images.append(self.analysisImagePage(urlImagePage, imageId))
        return images

    def _downloadImages(self, images):
        """Save every URL in ``images`` as NNNN.jpg inside the album directory."""
        total = len(images)
        for index, image in enumerate(images):
            pathImage = os.path.join(self.nameAlbum, '%04d.jpg' % index)
            urlretrieve(image, pathImage)
            print(pathImage)
            print('下载完成%.1f:%%' % ((index + 1) * 100.0 / total))

    def analysis(self, countPage):
        """Analyse the album pages and download the pictures they list.

        countPage -- number of index pages to process; 0 means all pages.
        """
        print('开始分析页面')
        images = self._collectImageUrls(countPage)
        print('分析完成,开始下载')
        self._downloadImages(images)

    def download(self, countPage=0):
        """Download the album; ``countPage`` pages, or all of them when 0."""
        self.analysis(countPage)


def main():
    """Prompt for the directory name and album URL, then download everything."""
    album = AntAlbumDownload()
    album.setAttr('', '')
    album.download(0)


if __name__ == '__main__':
    main()