1
25
2016
0

我的第一个小爬虫

import urllib
import urllib.request
import bs4

def download(url):
    """Fetch *url* and parse the response body into a BeautifulSoup tree.

    Args:
        url: URL of the page to download, passed as-is to
            ``urllib.request.urlopen``.

    Returns:
        A ``bs4.BeautifulSoup`` object built with the ``html.parser`` backend
        (body decoded as UTF-8), or ``None`` when the response status code is
        not 200.

    Exceptions raised by ``urllib.request.urlopen`` are not caught here and
    propagate to the caller.
    """
    # The response owns an open connection; the ``with`` block guarantees it
    # is closed even when reading or parsing raises.
    with urllib.request.urlopen(url) as response:
        if response.getcode() != 200:
            print('Wrong in Download')
            return None
        return bs4.BeautifulSoup(
            response.read(), 'html.parser', from_encoding='utf-8'
        )


这到底是什么 bug?(What on earth is this bug?)

import bs4
import urllib

def character(x):
    """Return False for ASCII letters and '.', True for anything else.

    Args:
        x: The string (normally a single character) to classify.

    Returns:
        ``False`` when *x* falls within ``'a'..'z'`` or ``'A'..'Z'`` under
        string comparison, or equals ``'.'``; ``True`` otherwise.
    """
    is_lower = 'a' <= x <= 'z'
    is_upper = 'A' <= x <= 'Z'
    return not (is_lower or is_upper or x == '.')

def analyze(soup):
    """Append one chapter to the output file and locate the next chapter link.

    The text of the first ``<h1>`` element and of the ``<div id="content">``
    element is appended, each followed by a newline, to ``文本输出.txt``
    (UTF-8) in the current working directory.

    Args:
        soup: Parsed page exposing ``find`` and ``find_all`` (a BeautifulSoup
            tree in this program).

    Returns:
        The ``href`` of the first ``<a>`` element whose text is exactly
        ``下一章``, or ``None`` if no such link exists.

    Raises:
        AttributeError: If the page has no ``<h1>`` or no ``<div id="content">``
            (``find`` returned ``None``). Nothing is written in that case.
    """
    # Extract both texts before touching the file, so a page missing either
    # element fails without leaving a half-written chapter behind.
    title = soup.find('h1').get_text()
    body = soup.find('div', id="content").get_text()

    # ``with`` closes the file even if a write fails.
    with open('文本输出.txt', 'a', encoding='utf-8') as out:
        out.write(title + '\n')
        out.write(body + '\n')

    for link in soup.find_all('a'):
        if link.get_text() == "下一章":
            return link['href']
    return None

    

 

import downloader
import analyzer
import time
from urllib.parse import urljoin

# Driver: starting from the first chapter page, download each page, let the
# analyzer append its text to the output file, then follow the link the
# analyzer returns until a download fails or no next link is found.
prefix = r'http://www.lingdiankanshu.com/html/0/175/'
url = r'75298.html'
n = 1  # 1-based counter of the chapter currently being processed
while True:
    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the system clock is adjusted mid-run.
    now = time.perf_counter()
    # urljoin resolves a relative href against the prefix and leaves an
    # absolute href intact, unlike plain string concatenation.
    soup = downloader.download(urljoin(prefix, url))
    if soup is None:
        print('Finished or Error')
        break
    url = analyzer.analyze(soup)
    print('Chapter',n,'finished in',time.perf_counter()-now,'seconds')
    if url is None:
        print('No found next!')
        break
    n = n + 1
    

 

Category: 未分类 | Tags: | Read Count: 401

登录 *


loading captcha image...
(输入验证码)
or Ctrl+Enter

Host by is-Programmer.com | Power by Chito 1.3.3 beta | Theme: Aeros 2.0 by TheBuckmaker.com