2
24
2016
0

图片爬虫+模拟登录+定时爬取——知乎图片爬取

首先是模拟登录。这个只要输错一次密码,就可以得到 POST 的数据了,非常良心。

这个没有 headers 验证。

# coding: utf-8
import requests
import bs4

# Form fields POSTed to the login endpoint.
# NOTE(review): "_xsrf" is a fixed token copied from one captured request; it
# presumably has to match the current session -- confirm before relying on it.
dic = {
    "_xsrf": "c892d53f291611e18902b1101bae4a8f",
    "email": "495327165@qq.com",
    "password": "your_password",
    "remember_me": "false",
}

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
}

# HTTPS, like every other URL in this script: the form carries the password,
# and if the server redirected a plain-HTTP POST to HTTPS the client would
# follow the redirect with a GET and lose the form body.
login_url = r"https://www.zhihu.com/login/email"
def login(s):
    """Log in through session *s* and report whether it worked.

    Posts the module-level ``dic`` form to ``login_url``, then fetches a
    profile page and looks for a link to "/people/edit" (presumably only
    rendered for the logged-in owner of the profile -- confirm).

    Args:
        s: a ``requests.Session``-like object providing ``post`` and ``get``.

    Returns:
        True if the "/people/edit" link was found, otherwise False.
    """
    s.post(login_url, dic, headers=headers)
    url_test = r"https://www.zhihu.com/people/wang-yuan-wei-38"
    responce = s.get(url_test, headers=headers)
    responce.encoding = 'utf-8'
    # responce.text is already decoded text, so no from_encoding is passed:
    # BeautifulSoup only uses that argument for byte input.
    soup = bs4.BeautifulSoup(responce.text, 'html.parser')

    logged_in = soup.find("a", href="/people/edit") is not None
    if logged_in:
        print("successful login")
    else:
        print("unsuccessful login")
    return logged_in

if __name__ == "__main__":
    # Manual check: try to log in with a fresh session and show the result.
    session = requests.Session()
    logged_in = login(session)
    print(logged_in)

 

下载图片:首先我们需要把照片放到以题目命名的文件夹下面,所以需要一个 Name 和一个 Path。

# coding: utf-8
import requests
import bs4
import urllib
import os
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}

def download(url,path,name):
    """Download *url* and save it as a file inside directory *path*.

    Args:
        url: address of the resource to fetch.
        path: target directory, relative to the current working directory.
            It is created (including parents) when missing. ``None`` saves
            into the current working directory itself.
        name: file name; the extension found at the end of *url* is
            appended unless *name* already ends with it. ``None`` uses the
            last 10 characters of *url* as the file name.

    Returns:
        True once the file has been written.

    Raises:
        urllib.error.URLError: if the resource cannot be fetched.
        OSError: if the directory or the file cannot be created.
    """
    # Imported here because the file only does ``import urllib``, which does
    # not by itself guarantee that the submodules are loaded.
    import urllib.parse
    import urllib.request

    print("downloading(%s,%s,%s)"%(url,path,name))
    cur_path = os.getcwd()
    # os.path.join() rejects None, so the "no directory" case is handled first.
    new_path = cur_path if path is None else os.path.join(cur_path,path)

    if name is None:
        name = url[-10:]
    else:
        # Take the extension from the URL's path component so that short URLs
        # and query strings cannot break it, and never append it twice.
        ext = os.path.splitext(urllib.parse.urlparse(url).path)[1]
        if ext and not name.lower().endswith(ext.lower()):
            name = name + ext

    if not os.path.isdir(new_path):
        print('创建',new_path)
        # makedirs also copes with nested paths such as "a/b".
        os.makedirs(new_path, exist_ok=True)

    # Fetch everything first so a failed request leaves no empty file behind,
    # and so the connection is closed before the file is written.
    with urllib.request.urlopen(url) as responce:
        data = responce.read()

    with open(os.path.join(new_path,name),'wb') as code:
        code.write(data)
    return True
#<img width="950" class="origin_image zh-lightbox-thumb" src="https://pic4.zhimg.com/7489cce13a572ec50a6a9725d3bf36bb_b.jpg" data-original="https://pic4.zhimg.com/7489cce13a572ec50a6a9725d3bf36bb_r.jpg" data-rawheight="355" data-rawwidth="950">
# Sample arguments used when this module is run directly as a script.
sample_path = r"为什么你告诉我到底是为什么!"
sample_name = "Pic_Try.jpg"
sample_url = r"https://pic4.zhimg.com/a577e2473387b3cea5422135fbb8632b_r.jpg"

if __name__ == "__main__":
    download(url=sample_url, path=sample_path, name=sample_name)

 

get一种存贮图片的方式:

 

最最伟大的主程序……不过看起来没有任何意思

#coding:utf-8
import requests
import bs4
import picture
import re
import os
from login import login
from time import sleep
# Browser-like User-Agent sent with the page requests below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
}

# Site root; question paths are appended to it.
url = r"https://www.zhihu.com/"
nextpage_url = r"https://www.zhihu.com/node/QuestionAnswerListV2"
# Same User-Agent plus the X-Requested-With header.
nextpage_headers = {**headers, "X-Requested-With": "XMLHttpRequest"}
# Image URLs that have already been downloaded.
visit = set()
def progress():
    """Print a single dot on its own line as a progress tick."""
    tick = "."
    print(tick)

# Translation table for make_dir: None deletes the character, anything else
# replaces it. Full-width look-alikes are written as escapes so that the
# replacement can never silently collapse back into the ASCII character.
_DIR_NAME_TABLE = str.maketrans({
    '\n': None,
    '\\': None,
    '/': None,
    '<': ' ',
    '>': ' ',
    '|': ' ',
    '*': '#',
    '?': '\uff1f',  # full-width question mark
    ':': '\uff1a',  # full-width colon
    '"': '\uff02',  # full-width quotation mark
})


def make_dir(path):
    """Return *path* with characters unusable in a directory name replaced.

    Despite its name this function creates nothing; it only sanitizes the
    text. Newlines, slashes and backslashes are removed; ``< > |`` become
    spaces; ``*`` becomes ``#``; ``? : "`` become their full-width
    look-alikes, so the name stays readable but is accepted by Windows.

    Args:
        path: the raw text (e.g. a question title) to use as directory name.

    Returns:
        The sanitized string.
    """
    return path.translate(_DIR_NAME_TABLE)

def find_pic_with_soup(soup,path = None):
    """Download every not-yet-seen image found in *soup*.

    Looks for ``<img>`` tags with class
    "origin_image zh-lightbox-thumb lazy" and downloads the URL in their
    ``data-original`` attribute through ``picture.download``. Each
    downloaded URL is added to the module-level ``visit`` set and appended
    to ``visit.txt`` so it is skipped next time.

    Args:
        soup: parsed page (a BeautifulSoup object).
        path: raw directory name for the images; it is sanitized with
            ``make_dir``. ``None`` is passed through to ``picture.download``
            unchanged.

    Returns:
        The number of images downloaded by this call.
    """
    lists = soup.find_all("img",class_ = "origin_image zh-lightbox-thumb lazy")
    if path is not None:
        path = make_dir(path)
    print("开始抓取问题:",path)
    number = 0

    # File names are the image's position on the page, counted from 2 and
    # advancing for skipped images too (kept as in the original numbering).
    for now, node in enumerate(lists, start=2):
        pic_url = node.get("data-original")
        if pic_url is None:
            # An <img> without the attribute has nothing to download.
            continue
        if pic_url in visit:
            print("已经爬去图片")
            continue
        number = number + 1
        picture.download(pic_url,path,str(now))

        # Record the URL only after the download call returned.
        visit.add(pic_url)
        with open('visit.txt','a') as file:
            file.write(pic_url+'\n')
    return number

def vis(path_url):
    """Fetch one question page and download the images it contains.

    The page is requested once; its first ``<h2>`` supplies the directory
    name handed to ``find_pic_with_soup``.

    Args:
        path_url: path of the question relative to the module-level ``url``.

    Returns:
        None. Progress and the number of downloaded images are printed.
    """
    progress()
    new_url = url + path_url
    print("爬去问题:",new_url)
    res = requests.get(new_url,headers = headers)
    progress()
    # res.text is already decoded text, so no from_encoding is passed:
    # BeautifulSoup only uses that argument for byte input.
    soup = bs4.BeautifulSoup(res.text,'html.parser')

    # The same parsed page provides the title; a second request is not needed.
    node = soup.find("h2")
    if node is None:
        # No <h2> on the page, so there is no directory name to use; skip it
        # rather than crash the whole crawl.
        print("未找到问题标题,跳过:",new_url)
        return
    path = node.get_text()
    count = find_pic_with_soup(soup,path)
    print("总共下载",count,"张图片")

if __name__ == "__main__":
    s = requests.Session()
    if not login(s):
        print("Login Failed! Terminated")
    else:
        # Reload the image URLs recorded by earlier runs. The file does not
        # exist before the first download, which simply means nothing has
        # been fetched yet.
        try:
            with open('visit.txt','r') as file:
                for line in file:
                    pic = line.strip()
                    if pic:
                        visit.add(pic)
        except FileNotFoundError:
            pass

        # Poll the front page forever, visiting every linked question.
        while True:
            responce = s.get(url,headers = headers)
            responce.encoding = 'utf-8'

            # responce.text is already decoded, so from_encoding is not passed.
            soup = bs4.BeautifulSoup(responce.text,'html.parser')
            nodes = soup.find_all("a",class_="question_link")

            for node in nodes:
                dis_url = node["href"]
                # Cut off any "#fragment" and the leading "/". partition() is
                # used because str.find() returns -1 when there is no "#",
                # which as a slice end would drop the last character.
                cur_url = dis_url.partition('#')[0][1:]
                vis(cur_url)
            print("Waiting for next Scrub")
            sleep(600)

再加上一个__init__.py就完成了

 

Category: 未分类 | Tags: python 爬虫 | Read Count: 914

登录 *


loading captcha image...
(输入验证码)
or Ctrl+Enter

Host by is-Programmer.com | Power by Chito 1.3.3 beta | Theme: Aeros 2.0 by TheBuckmaker.com