akdien 2012-08-06
代码如下:
COPY SWFToImage.dll %windir%\system32 regsvr32 %windir%\system32\SWFToImage.dll
代码如下:
#用python下载百度文库的代码,需要的同志请修改,下面有提示 #http://www.cnblogs.com/dearplain/ #code by plain import urllib2 import win32com.client import os import sys if __name__=='__main__': #os.system(''); os.chdir('D:\my project\pywenku')#保存到哪个文件夹 SWFToImage=win32com.client.Dispatch("{479A1AAC-C148-40BB-9868-A9773DA66AF9}"); ''' allfile=os.listdir(".") findrecord=0 for file in allfile: if file==".record": record=open(file,'rw') findrecord=1 break if findrecord==0: record=open('.record','w') ''' #url="http://wenku.baidu.com/view/8d3ed840be1e650e52ea9938.html?from=rec&pos=1&weight=2&lastweight=2&count=5" #url="http://wenku.baidu.com/view/f2fe7a3987c24028915fc37a.html?from=related&hasrec=1" #url就是你要下载的文档的地址 url=sys.argv[1] if url.find("http://")!=0: print "error! the url is not correct" sys.exit() print "downloading %s"%url try: urlReferer=url[url.index('http'):url.index('/v')] print urlReferer #urlbody=url[url.index('/v')-1:] urlnum=url[url.index('ew/')+3:url.index('.htm')] except ValueError: print "parse url error" sys.exit() #print urlnum wenku='wenku.baidu.com' reurl='/play/' pagefrom='?pn=' downnum='&rn=' #try to get title and make dir req=urllib2.Request(url) res=urllib2.urlopen(req) data=res.read() try: sfrom=data.index('<title>')+len('<title>') #print sfrom sbefore=sfrom+data[sfrom:].index('</title>') #print sbefore title=data[sfrom:sbefore] title=title[:title.rindex('_')] print 'downloading '+title except ValueError: print "get title error" sys.exit() allfile=os.listdir(".") if (title in allfile)==False: os.mkdir(title) os.chdir('./'+title) #get the first swf req=urllib2.Request('http://wenku.baidu.com'+reurl+urlnum+pagefrom+'1'+downnum+'1') req.add_header("Referer", urlReferer ) res=urllib2.urlopen(req) data=res.read() res.close() head=data[0:45] pagenum=0 sfrom=head.index('\":\"')+len('\":\"') sbefore=sfrom+head[sfrom:].index('\"') pagenum=int(head[sfrom:sbefore]) print 'pagenum:'+str(pagenum) if pagenum<=0 or pagenum>2000: print "error!!!pagenum<0 or pagenum>2000" sys.exit() data=data[106:] swf=open("1.pywenku",'wb') swf.write(data) swf.close() i=1 SWFToImage.InputSWFFileName="%d.pywenku"%i SWFToImage.ImageOutputType = 1 SWFToImage.ImageWidth=1048 SWFToImage.ImageHeight=1478 SWFToImage.Execute_Begin() SWFToImage.FrameIndex = 1 SWFToImage.Execute_GetImage() SWFToImage.SaveToFile("%d.jpg"%i) SWFToImage.Execute_End() os.rename("%d.pywenku"%i,"%d.swf"%i) allfile=os.listdir(".") #从第二页下到最后一页 for i in range(2,pagenum+1): if '%d.swf'%i in allfile: continue #not find in the dir mean req=urllib2.Request('http://wenku.baidu.com'+reurl+urlnum+pagefrom+str(i)+downnum+'1') res=urllib2.urlopen(req) data=res.read() data=data[106:] swf=open("%d.pywenku"%i,'wb') swf.write(data) swf.close() SWFToImage.InputSWFFileName="%d.pywenku"%i SWFToImage.ImageOutputType = 1 SWFToImage.Execute_Begin() SWFToImage.FrameIndex = 1 SWFToImage.Execute_GetImage() SWFToImage.SaveToFile("%d.jpg"%i) SWFToImage.Execute_End() os.rename("%d.pywenku"%i,"%d.swf"%i) res.close() print 'task complete'