我的第一个小爬虫

1
25
2016

import urllib
import urllib.request
import bs4

def download(url):
    """Fetch *url* and parse the response body as an HTML document.

    Args:
        url: Address of the page to fetch; passed to
            ``urllib.request.urlopen`` unchanged.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``'html.parser'``
        backend and ``from_encoding='utf-8'``, or ``None`` when the
        response reports a status code other than 200.

    Any exception raised by ``urlopen`` itself is not caught here and
    propagates to the caller.
    """
    # The with-block closes the response (and its underlying connection) on
    # every path: the early return, a parse failure, or normal completion.
    with urllib.request.urlopen(url) as response:
        if response.getcode() != 200:
            print('Wrong in Download')
            return None
        return bs4.BeautifulSoup(response.read(), 'html.parser',
                                 from_encoding='utf-8')

这到底是什么 bug?

import bs4
import urllib

def character(x):
    """Return False for ASCII letters and the full stop, True otherwise.

    Args:
        x: String to classify. The comparisons are ordinary string
            comparisons, so a multi-character string is judged by where
            it sorts relative to the range bounds.

    Returns:
        ``False`` when *x* lies in ``'a'..'z'`` or ``'A'..'Z'`` or equals
        ``'.'``; ``True`` in every other case.
    """
    is_lower = 'a' <= x <= 'z'
    is_upper = 'A' <= x <= 'Z'
    return not (is_lower or is_upper or x == '.')

def analyze(soup):
    """Append one chapter to the output file and locate the next chapter.

    The text of the first ``<h1>`` and of ``<div id="content">`` is
    appended, each followed by a newline, to ``文本输出.txt`` (UTF-8).

    Args:
        soup: Parsed chapter page; must provide ``find`` and ``find_all``
            the way a BeautifulSoup document does.

    Returns:
        The ``href`` of the first ``<a>`` whose text is exactly "下一章"
        and which carries a non-empty ``href``, or ``None`` when no such
        link exists.

    Raises:
        AttributeError: If the page has no ``<h1>`` or no
            ``<div id="content">`` (``find`` yields ``None``).
    """
    # Extract both texts before opening the file, so a malformed page fails
    # without leaving an open handle or a half-written chapter behind.
    title = soup.find('h1').get_text()
    body = soup.find('div', id="content").get_text()

    # The context manager closes the file even if a write fails.
    with open('文本输出.txt', 'a', encoding='utf-8') as out:
        out.write(title + '\n')
        out.write(body + '\n')

    for link in soup.find_all('a'):
        # Anchors without an href cannot be followed; skip them instead of
        # raising KeyError on link['href'].
        if link.get_text() == "下一章" and link.get('href'):
            return link['href']
    return None

import downloader
import analyzer
import time
# Driver: starting from the first chapter, download each page, append its
# text to the output file, then follow the "next chapter" link until none
# is found or a download fails.
from urllib.parse import urljoin

prefix = r'http://www.lingdiankanshu.com/html/0/175/'
url = r'75298.html'
n = 1
# Full address of the page currently being processed.
page_url = urljoin(prefix, url)
while True:
    now = time.time()
    soup = downloader.download(page_url)
    if soup is None:
        print('Finished or Error')
        break
    url = analyzer.analyze(soup)
    print('Chapter', n, 'finished in', time.time() - now, 'seconds')
    if url is None:
        print('No found next!')
        break
    # Resolve the link against the page it came from, so relative,
    # root-relative and absolute hrefs all yield a valid address
    # (plain concatenation only works for bare file names).
    page_url = urljoin(page_url, url)
    n = n + 1

Category: 未分类 | Tags: | Read Count: 790

davidwang's Blog

Happy coding

我的第一个小爬虫

分类

最新评论

最新留言

链接

RSS

功能