1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
import json
import os
import re
import time

import chardet
import requests
from bs4 import BeautifulSoup
from PIL import Image
from requests.exceptions import ReadTimeout
class GetPpt:
    """Download the slide images of a Baidu Wenku PPT document and merge
    them into a single PDF file."""

    def __init__(self, url, savepath):
        """
        :param url: Baidu Wenku document page URL
        :param savepath: output directory; '' falls back to the current
                         working directory
        """
        self.url = url
        # Empty string means "save next to where the script runs".
        self.savepath = savepath if savepath != '' else os.getcwd()
        self.html = ''
        self.wkinfo = {}
        self.ppturls = []
        # Eagerly fetch the page and extract document metadata so the
        # instance is ready for getPPT() immediately after construction.
        self.getHtml()
        self.getWkInfo()
def getHtml(self): try: header = {'User-Agent': 'Mozilla/5.0 ' '(Macintosh; Intel Mac OS X 10_14_6) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/78.0.3904.108 Safari/537.36'} response = requests.get(self.url, headers = header) self.transfromEncoding(response) self.html = BeautifulSoup(response.text, 'html.parser') except ReadTimeout as e: print(e) return None
def transfromEncoding(self,html): html.encoding = chardet.detect(html.content).get("encoding")
def getWkInfo(self): items = ["'title'","'docType'","'docId'","'totalPageNum"] for item in items: ls = re.findall(item+".*'", str(self.html)) if len(ls) != 0: message = ls[0].split(':') self.wkinfo[eval(message[0])] = eval(message[1])
def getJson(self, url): """ :param url: json文件所在页面的url :return: json格式字符串 """ response = requests.get(url) jsonstr=response.text[response.text.find('(')+1: response.text.rfind(')')] return jsonstr
def convertJsonToDict(self, jsonstr): """ :param jsonstr: json格式字符串 :return: json字符串所对应的python字典 """ textdict = json.loads(jsonstr) return textdict
def getImageUrlForPPT(self): timestamp = round(time.time()*1000) desturl = "https://wenku.baidu.com/browse/getbcsurl?doc_id="+\ self.wkinfo.get("docId")+\ "&pn=1&rn=99999&type=ppt&callback=jQuery1101000870141751143283_"+\ str(timestamp) + "&_=" + str(timestamp+1)
textdict = self.convertJsonToDict(self.getJson(desturl)) self.ppturls = [x.get('zoom') for x in textdict.get('list')]
def getImage(self, imagename, imageurl): with open(imagename,'wb') as ig: ig.write(requests.get(imageurl).content)
def mergeImageToPDF(self, pages): if pages == 0: raise IOError
namelist = [str(x)+'.png' for x in range(pages)] firstimg = Image.open(namelist[0]) imglist = [] for imgname in namelist[1:]: img = Image.open(imgname) img.load()
if img.mode == 'RGBA': img.mode = 'RGB' imglist.append(img)
savepath = self.savepath+'/'+self.wkinfo.get('title')+'.pdf' firstimg.save(savepath, "PDF", resolution=100.0, save_all=True, append_images=imglist)
def removeImage(self, pages): namelist = [str(x) + '.png' for x in range(pages)] for name in namelist: if os.path.exists(name): os.remove(name)
def getPPT(self): self.getImageUrlForPPT() for page, url in enumerate(self.ppturls): self.getImage(str(page)+'.png', url) self.mergeImageToPDF(len(self.ppturls)) self.removeImage(len(self.ppturls))
if __name__ == '__main__':
    # Example run: download this Wenku document into the given directory.
    GetPpt('https://wenku.baidu.com/view/6e6b9f397cd184254b3535b6.html?from=search',
           '存储路径').getPPT()
|