From 4bf56a24ffac6cd93d71c067122e9ef534b9e07c Mon Sep 17 00:00:00 2001 From: lawtribes <34047348+lawtribes@users.noreply.github.com> Date: Fri, 28 Aug 2020 11:24:43 +0800 Subject: [PATCH 1/2] hotfix:20200828.01 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.使用try……except抓取网页正文获取、图片下载、HTML转换PDF过程中的错误,并跳过错误继续执行; 2.bs转为string过程中容易出现递归深度问题(暂未研究根源),仅抓取并重设最大递归深度翻番 --- start.py | 93 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 74 insertions(+), 19 deletions(-) diff --git a/start.py b/start.py index eb992d6..3923089 100644 --- a/start.py +++ b/start.py @@ -52,12 +52,19 @@ def DownLoadHtml(url): 'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3' } requests.packages.urllib3.disable_warnings() - response = requests.get(url,headers = headers,proxies=None,verify=False) - if response.status_code == 200: - htmltxt = response.text #返回的网页正文 - return htmltxt - else: - return None + + #使用try……except抓取错误并跳过,避免中断 + try: + response = requests.get(url,headers = headers,proxies=None,verify=False) + if response.status_code == 200: + htmltxt = response.text #返回的网页正文 + return htmltxt + else: + return None + except Exception as e: + print("\n出现错误,错误如下:"+str(e)) + print("----------------------跳过--------------------") + pass #将图片从远程下载保存到本地 def DownImg(url,savepath): @@ -69,9 +76,17 @@ def DownImg(url,savepath): 'Accept-Language':'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3' } requests.packages.urllib3.disable_warnings() - r = requests.get(url,headers = headers,proxies=None,verify=False) - with open(savepath, 'wb') as f: - f.write(r.content) + + + #使用try……except抓取错误并跳过,避免中断 + try: + r = requests.get(url,headers = headers,proxies=None,verify=False) + with open(savepath, 'wb') as f: + f.write(r.content) + except Exception as e: + print("\n出现错误,错误如下:"+str(e)) + print("----------------------跳过--------------------") + pass #修改网页中图片的src,使图片能正常显示 def ChangeImgSrc(htmltxt,saveimgdir,htmlname): @@ -103,8 +118,23 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname): img.attrs["src"] = "" ChangeCssSrc(bs) #修改link标签 ChangeContent(bs) #修改js_content的style,使正文能正常显示 - return str(bs) #将BeautifulSoup对象再转换为字符串,用于保存 - + + #try……except抓取错误并跳过,避免中断 + try: + #bs转为str过程中容易出现问题,暂未研究治本方法 + return str(bs) #将BeautifulSoup对象再转换为字符串,用于保存 + + #出现错误maximum recursion depth exceeded while calling a Python object + except Exception as e: + print("\n出现错误,错误如下:"+str(e)) + error="maximum recursion depth exceeded while calling a Python object" + if str(e)==error: + maximum_value = int(sys.getrecursionlimit()) + sys.setrecursionlimit(2*maximum_value)#最大深度乘以2 + print("最大递归深度已调整为:"+str(2*maximum_value)) + return str(bs) + + def ChangeCssSrc(bs): linkList = bs.findAll("link") for link in linkList: @@ -188,11 +218,13 @@ def DownHtmlMain(jsonDir,saveHtmlDir): if os.path.exists(arthtmlsavepath): print("exists",arthtmlsavepath) continue + arthtmlstr = DownLoadHtml(art.url) + + arthtmlstr = ChangeImgSrc(arthtmlstr,saveImgDir,artname) print("\r",end="") SaveFile(arthtmlsavepath,arthtmlstr) - sleep(3) #防止下载过快被微信屏蔽,间隔3秒下载一篇 #把一个文件夹下的html文件都转为pdf @@ -221,7 +253,21 @@ def PDFDir(htmldir,pdfdir): 注意此处去掉了css(link),如果发现pdf格式乱了可以不去掉css """ [s.extract() for s in bs(["script", "iframe", "link"])] - SaveFile(tmppath, str(bs)) + + #try……except抓取错误并跳过,避免中断 + try: + SaveFile(tmppath, str(bs)) #bs转为str容易出现递归深度问题 + + #出现错误maximum recursion depth exceeded while calling a Python object + except Exception as e: + print("\n出现错误,错误如下:"+str(e)) + error="maximum recursion depth exceeded while calling a Python object" + if str(e)==error: + maximum_value = int(sys.getrecursionlimit()) + sys.setrecursionlimit(2*maximum_value)#最大递归深度乘以2 + print("最大递归深度已调整为:"+str(2*maximum_value)) + SaveFile(tmppath, str(bs)) + #SaveFile(tmppath, str(bs)) PDFOne(tmppath,pdfpath) #把一个Html文件转为pdf @@ -240,11 +286,20 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True): cmdlist.append(" " + pdfpath + " ") cmdstr = exepath + "".join(cmdlist) print(cmdstr) - result = subprocess.check_call(cmdstr, shell=False) - # stdout,stderr = result.communicate() - # result.wait() #等待转换完一个再转下一个 - if removehtml: - os.remove(htmlpath) + + + #执行中出现错误subprocess.CalledProcessError: Command 'wkhtmltopdf.exe + #错误导致退出:Exit with code 1 due to network error: UnknownNetworkError + try: + result = subprocess.check_call(cmdstr, shell=False) + # stdout,stderr = result.communicate() + # result.wait() #等待转换完一个再转下一个 + if removehtml: + os.remove(htmlpath) + except Exception as e: + print("\n出现错误,错误如下:"+str(e)) + print("----------------------跳过--------------------") + pass """ @@ -266,7 +321,7 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True): jsbd = GetJson() saveHtmlDir = jsbd["htmlDir"] jsdir= jsbd["jsonDir"] - DownHtmlMain(jsdir,saveHtmlDir) + DownHtmlMain(jsdir,saveHtmlDir) elif arg == "pdf": jsbd = GetJson() saveHtmlDir = jsbd["htmlDir"] From 63a85b0c6ec233335a757266847be834bf8a6c4b Mon Sep 17 00:00:00 2001 From: lawtribes <34047348+lawtribes@users.noreply.github.com> Date: Fri, 28 Aug 2020 15:16:52 +0800 Subject: [PATCH 2/2] hotfix-20200828-02 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1.使用loguru来记录log信息,方便bug调试 2.package里增加了loguru选项,方便前期依赖安装 --- setupPackage.py | 2 +- start.py | 28 +++++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/setupPackage.py b/setupPackage.py index adf0ded..692defe 100644 --- a/setupPackage.py +++ b/setupPackage.py @@ -3,6 +3,6 @@ #如果从默认源安装比较慢的话直接运行这个文件安装 -lst=["beautifulsoup4","lxml","requests"] +lst=["beautifulsoup4","lxml","requests","loguru"]#增加日志模块loguru for pkg in lst: call("pip install -i https://pypi.douban.com/simple --upgrade " + pkg) \ No newline at end of file diff --git a/start.py b/start.py index 3923089..f365eee 100644 --- a/start.py +++ b/start.py @@ -5,24 +5,34 @@ from bs4 import BeautifulSoup from datetime import datetime,timedelta from time import sleep +from loguru import logger#导入logger + """ 本项目开源地址 https://github.com/LeLe86/vWeChatCrawl 讨论QQ群 703431832 """ +#使用loguru来抓取log相关信息,方便后续修改 +#loguru project:https://github.com/Delgan/loguru +logger.add(sys.stderr, format="{time} {level} {message}", filter="my_module", level="INFO") +logger.add('log.log') + +@logger.catch #保存文件 def SaveFile(fpath,fileContent): with open(fpath, 'w', encoding='utf-8') as f: f.write(fileContent) - + +@logger.catch #读取文件 def ReadFile(filepath): with open(filepath, 'r', encoding='utf-8') as f: all_the_text = f.read() return all_the_text +@logger.catch #时间戳转日期 def Timestamp2Datetime(stampstr): dt = datetime.utcfromtimestamp(stampstr) @@ -30,6 +40,7 @@ def Timestamp2Datetime(stampstr): newtimestr = dt.strftime("%Y%m%d_%H%M%S") return newtimestr +@logger.catch #初始化环境 def GetJson(): jstxt = ReadFile("config.json") @@ -42,6 +53,7 @@ def GetJson(): return jsbd +@logger.catch #下载url网页 def DownLoadHtml(url): #构造请求头 @@ -66,6 +78,7 @@ def DownLoadHtml(url): print("----------------------跳过--------------------") pass +@logger.catch #将图片从远程下载保存到本地 def DownImg(url,savepath): #构造请求头 @@ -88,6 +101,7 @@ def DownImg(url,savepath): print("----------------------跳过--------------------") pass +@logger.catch #修改网页中图片的src,使图片能正常显示 def ChangeImgSrc(htmltxt,saveimgdir,htmlname): bs =BeautifulSoup(htmltxt,"lxml") #由网页源代码生成BeautifulSoup对象,第二个参数固定为lxml @@ -134,7 +148,8 @@ def ChangeImgSrc(htmltxt,saveimgdir,htmlname): print("最大递归深度已调整为:"+str(2*maximum_value)) return str(bs) - + +@logger.catch def ChangeCssSrc(bs): linkList = bs.findAll("link") for link in linkList: @@ -142,7 +157,8 @@ def ChangeCssSrc(bs): if href.startswith("//"): newhref = "http:" + href link.attrs["href"] = newhref - + +@logger.catch def ChangeContent(bs): jscontent = bs.find(id="js_content") if jscontent: @@ -158,6 +174,7 @@ def __init__(self,url,pubdate,idx,title): self.idx = idx self.title = title +@logger.catch #从fiddler保存的json文件中提取文章url等信息 def GetArticleList(jsondir): filelist = os.listdir(jsondir) @@ -197,6 +214,8 @@ def GetArticleList(jsondir): print(len(ArtList),pubdate, idx, title) return ArtList + +@logger.catch def DownHtmlMain(jsonDir,saveHtmlDir): saveHtmlDir = jsbd["htmlDir"] if not os.path.exists(saveHtmlDir): @@ -227,6 +246,7 @@ def DownHtmlMain(jsonDir,saveHtmlDir): SaveFile(arthtmlsavepath,arthtmlstr) sleep(3) #防止下载过快被微信屏蔽,间隔3秒下载一篇 +@logger.catch #把一个文件夹下的html文件都转为pdf def PDFDir(htmldir,pdfdir): if not os.path.exists(pdfdir): @@ -270,6 +290,7 @@ def PDFDir(htmldir,pdfdir): #SaveFile(tmppath, str(bs)) PDFOne(tmppath,pdfpath) +@logger.catch #把一个Html文件转为pdf def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True): if skipExists and os.path.exists(pdfpath): @@ -313,6 +334,7 @@ def PDFOne(htmlpath,pdfpath,skipExists=True,removehtml=True): 运行 python start.py pdf #把下载的html转pdf """ if __name__ == "__main__": + if len(sys.argv)==1: arg = None else: