First comes the simulated login. Getting the POST payload is easy: type a wrong password once and the login request, fields and all, shows up in the browser's developer tools, which is quite considerate of Zhihu. The endpoint doesn't do any strict header validation either.
# coding: utf-8
import requests
import bs4

# Form data captured from a failed login attempt; _xsrf is the anti-CSRF token
# embedded in the login page and will differ between sessions.
dic = {"_xsrf": "c892d53f291611e18902b1101bae4a8f",
       "email": "495327165@qq.com",
       "password": "your_password",
       "remember_me": "false"}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}
login_url = r"http://www.zhihu.com/login/email"


def login(s):
    # Post the login form, then check whether a logged-in-only link
    # ("/people/edit") shows up on a profile page.
    s.post(login_url, dic, headers=headers)
    url_test = r"https://www.zhihu.com/people/wang-yuan-wei-38"
    response = s.get(url_test, headers=headers)
    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    a = soup.find("a", href="/people/edit")
    if a is not None:
        print("successful login")
        return True
    else:
        print("unsuccessful login")
        return False


if __name__ == "__main__":
    s = requests.Session()
    print(login(s))
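One caveat about the snippet above: the _xsrf value is hard-coded from a captured request, but it is a per-session anti-CSRF token. Below is a small sketch of fetching it on the fly; it assumes the sign-in page still embeds the token as a hidden input named _xsrf, which may have changed.

# Sketch only: read the _xsrf token off the sign-in page instead of hard-coding it.
# Assumes a hidden <input name="_xsrf" value="..."> is present, which may no longer
# hold. Reuses the headers dict and bs4 import from the login script above.
def get_xsrf(s):
    res = s.get("https://www.zhihu.com/#signin", headers=headers)
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    node = soup.find("input", attrs={"name": "_xsrf"})
    return node["value"] if node is not None else None

With that in place, one would set dic["_xsrf"] = get_xsrf(s) before calling s.post.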
Downloading the images: each picture should end up in a folder named after its question, so the download routine needs both a name and a path.
# coding: utf-8
import os
import urllib.request

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}


def download(url, path, name):
    """Download one image from `url` into the folder `path`, saving it as `name`."""
    print("downloading(%s,%s,%s)" % (url, path, name))
    if name is None:
        # No name given: fall back to the tail of the URL.
        name = url[-10:]
    else:
        # Copy the file extension (".jpg", ".jpeg", ...) over from the URL.
        if url[-4] == '.':
            name = name + url[-4:]
        if url[-5] == '.':
            name = name + url[-5:]
    if path is None:
        file_path = '/' + name
    else:
        # Create the question's folder on first use.
        new_path = os.path.join(os.getcwd(), path)
        if not os.path.isdir(new_path):
            print('创建', new_path)
            os.mkdir(new_path)
        file_path = path + '/' + name
    response = urllib.request.urlopen(url)
    with open(file_path, 'wb') as f:
        f.write(response.read())
    return True


# The tag we are after looks like this:
# <img width="950" class="origin_image zh-lightbox-thumb"
#      src="https://pic4.zhimg.com/7489cce13a572ec50a6a9725d3bf36bb_b.jpg"
#      data-original="https://pic4.zhimg.com/7489cce13a572ec50a6a9725d3bf36bb_r.jpg"
#      data-rawheight="355" data-rawwidth="950">
sample_url = r"https://pic4.zhimg.com/a577e2473387b3cea5422135fbb8632b_r.jpg"
sample_name = "Pic_Try"  # the ".jpg" extension is appended from the URL
sample_path = r"为什么你告诉我到底是为什么!"

if __name__ == "__main__":
    download(sample_url, sample_path, sample_name)
So that is one way of fetching and storing an image.
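As an aside (not part of the original scripts), the same save step could also go through requests instead of urllib. A minimal sketch, with a made-up helper name save_image:

# Minimal sketch of an alternative save routine built on requests; save_image is a
# hypothetical helper, not something the original scripts define.
import requests

def save_image(url, file_path, headers=None):
    res = requests.get(url, headers=headers, stream=True)
    with open(file_path, 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)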
And finally the grand main program... which, to be honest, doesn't look all that interesting.
# coding: utf-8
import requests
import bs4
import picture
import os
from login import login
from time import sleep

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586"}
url = r"https://www.zhihu.com/"
# AJAX endpoint Zhihu uses to load further answers (see the note below).
nextpage_url = r"https://www.zhihu.com/node/QuestionAnswerListV2"
nextpage_headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586",
                    "X-Requested-With": "XMLHttpRequest"}

# URLs of images that have already been downloaded, persisted in visit.txt.
visit = set()


def progress():
    print(".")


def make_dir(path):
    # Strip or replace characters that are not allowed in (Windows) folder names,
    # substituting full-width equivalents where possible.
    path = path.replace('\n', '')
    path = path.replace('?', '?')
    path = path.replace('\\', '')
    path = path.replace('/', '')
    path = path.replace('<', ' ')
    path = path.replace('>', ' ')
    path = path.replace('|', ' ')
    path = path.replace('*', '#')
    path = path.replace(':', ':')
    return path


def find_pic_with_soup(soup, path=None):
    # Grab every lazily loaded answer image and download its full-size version
    # (the data-original attribute), skipping URLs we have seen before.
    lists = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
    path = make_dir(path)
    print("开始抓取问题:", path)
    now = 1
    number = 0
    for node in lists:
        now = now + 1
        if node["data-original"] in visit:
            print("已经爬取图片")
            continue
        number = number + 1
        picture.download(node["data-original"], path, str(now))
        visit.add(node["data-original"])
        with open('visit.txt', 'a') as f:
            f.write(node['data-original'] + '\n')
    return number


def vis(path_url):
    progress()
    new_url = url + path_url
    print("爬取问题:", new_url)
    res = requests.get(new_url, headers=headers)
    progress()
    soup = bs4.BeautifulSoup(res.text, 'html.parser')
    # The question page's <h2> is the question title; it becomes the folder name.
    node = soup.find("h2")
    path = node.get_text()
    count = find_pic_with_soup(soup, path)
    print("总共下载", count, "张图片")


if __name__ == "__main__":
    s = requests.Session()
    if not login(s):
        print("Login Failed! Terminated")
    else:
        # Reload the set of already-downloaded image URLs, if any.
        if os.path.exists('visit.txt'):
            with open('visit.txt', 'r') as f:
                for pic in f:
                    pic = pic.replace('\n', '')
                    if pic != '':
                        visit.add(pic)
        while True:
            response = s.get(url, headers=headers)
            response.encoding = 'utf-8'
            soup = bs4.BeautifulSoup(response.text, 'html.parser')
            nodes = soup.find_all("a", class_="question_link")
            for node in nodes:
                dis_url = node["href"]
                pos = dis_url.find('#')
                # Strip the leading '/' and the '#answer-...' anchor if present.
                cur_url = dis_url[1:pos] if pos != -1 else dis_url[1:]
                vis(cur_url)
            print("Waiting for next scrape")
            sleep(600)
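One loose end: nextpage_url and nextpage_headers are defined but never used, so only the answers rendered on the first page of each question get crawled. Loading more answers went through that QuestionAnswerListV2 endpoint; the sketch below shows roughly how it might be wired in, but the payload layout (method/params with a url_token and offset) is an assumption about the unofficial interface of that era and may well be outdated.

# Sketch only -- the payload layout here is an assumption, not taken from the original code.
import json

def load_more_answers(s, question_token, offset, xsrf):
    params = json.dumps({"url_token": question_token, "pagesize": 10, "offset": offset})
    data = {"_xsrf": xsrf, "method": "next", "params": params}
    res = s.post(nextpage_url, data=data, headers=nextpage_headers)
    # The response was JSON whose "msg" field held HTML fragments; each fragment can be
    # parsed with BeautifulSoup and handed to find_pic_with_soup() like a full page.
    return res.json().get("msg", [])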
Add an __init__.py and the whole thing is done.
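For reference, the project ends up as a small package looking roughly like this. Only the login and picture module names are fixed by the imports (from login import login, import picture); the package name zhihu_pic and the other file names are assumptions.

zhihu_pic/
    __init__.py
    login.py      # simulated login, exposes login()
    picture.py    # single-image downloader, exposes download()
    main.py       # the crawl loop shown above
    visit.txt     # created at runtime, records downloaded image URLs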