import urllib
import urllib.error
import urllib.request

import bs4


def download(url):
    """Fetch *url* and return its HTML parsed as a BeautifulSoup tree.

    The response bytes are handed to BeautifulSoup with the built-in
    ``html.parser`` backend and an explicit UTF-8 source encoding.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The parsed ``bs4.BeautifulSoup`` document, or ``None`` (after
        printing ``'Wrong in Download'``) when the request fails or the
        server answers with a status code other than 200.
    """
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(url) as response:
            if response.getcode() != 200:
                print('Wrong in Download')
                return None
            html = response.read()
    except urllib.error.URLError:
        # urlopen raises (instead of returning a response) for HTTP error
        # statuses and unreachable hosts; report those the same way as a
        # non-200 answer so callers only have to check for None.
        print('Wrong in Download')
        return None
    return bs4.BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
# What on earth is this bug?
import bs4
import urllib


def character(x):
    """Return ``False`` for an ASCII letter or ``'.'``, ``True`` otherwise.

    The test uses plain string ordering, so it is only meaningful when *x*
    is a single character.
    """
    if 'a' <= x <= 'z' or 'A' <= x <= 'Z':
        return False
    return x != '.'


def analyze(soup):
    """Append one chapter's text to the output file and locate the next page.

    The text of the first ``<h1>`` and of the ``<div id="content">`` element
    is appended, each followed by a newline, to ``文本输出.txt`` (UTF-8,
    append mode).

    Args:
        soup: Parsed page offering BeautifulSoup's ``find`` / ``find_all``.

    Returns:
        The ``href`` of the first ``<a>`` whose text is exactly ``"下一章"``
        (the "next chapter" link), or ``None`` when there is no such link
        or it carries no ``href``.
    """
    title = soup.find('h1')
    content = soup.find('div', id="content")

    # `with` guarantees the file is closed even if a write fails.
    with open('文本输出.txt', 'a', encoding='utf-8') as out:
        for tag in (title, content):
            # find() returns None for a missing element (e.g. an index or
            # error page); skip it instead of crashing on None.get_text().
            if tag is not None:
                out.write(tag.get_text() + '\n')

    for link in soup.find_all('a'):
        if link.get_text() == "下一章":
            # .get() yields None for an anchor without href rather than
            # raising KeyError; the first matching anchor decides.
            return link.get('href')
    return None
import time
import urllib.parse

import analyzer
import downloader

# Directory URL of the book and the file name of the first chapter to fetch.
prefix = r'http://www.lingdiankanshu.com/html/0/175/'
url = r'75298.html'
n = 1  # 1-based count of chapters processed, used only for progress output

# Follow "next chapter" links until a download fails or no link is found.
while True:
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments.
    now = time.perf_counter()
    # urljoin gives prefix + url for a bare file name and also resolves
    # root-relative or absolute hrefs correctly.
    soup = downloader.download(urllib.parse.urljoin(prefix, url))
    if soup is None:
        print('Finished or Error')
        break
    url = analyzer.analyze(soup)
    print('Chapter', n, 'finished in', time.perf_counter() - now, 'seconds')
    if url is None:
        print('No found next!')
        break
    n = n + 1