用Python写爬虫是很方便的。最近看了xlzd.me的文章,他的文章写得很到位,提供了很好的思路。因为他的文章中省略了部分代码,下面给出基于他的文章的三个代码片段:
基于Python3,Python2的话需要修改下input输入函数和print的用法。代码地址
爬取豆瓣电影top250
爬取拉勾网职位信息
模拟登陆知乎
为什么没人给我点赞?!
有些代码做了更改,其中把获取的数据存储到了 Excel 中。关于用 Python 存取 Excel 数据,可以参考我的另一篇文章。
用到的库
requests
Beautiful Soup
openpyxl
#!/usr/bin/env python
# encoding=utf-8
import requests,re
import codecs
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Workbook set up once at module level; main() fills it in and saves it.
wb = Workbook()
ws1 = wb.active
ws1.title = "电影top250"
# Output file name for the scraped results.
dest_filename = '电影.xlsx'
# First page of the douban top250 list; "next page" hrefs are joined onto it.
DOWNLOAD_URL = 'http://movie.douban.com/top250/'
def download_page(url):
    """Fetch the raw HTML content of *url*.

    A desktop-browser User-Agent header is sent so the site does not
    reject the request as an automated client.
    """
    ua = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) '
          'AppleWebKit/537.36 (KHTML, like Gecko) '
          'Chrome/47.0.2526.80 Safari/537.36')
    response = requests.get(url, headers={'User-Agent': ua})
    return response.content
def get_li(doc):
    """Parse one page of the douban top250 movie list.

    Returns five values: movie names, rating counts, scores, short
    reviews ('无' when a movie has none), and the absolute URL of the
    next page (None when on the last page).
    """
    soup = BeautifulSoup(doc, 'html.parser')
    ol = soup.find('ol', class_='grid_view')
    name = []       # movie titles
    star_con = []   # number of people who rated
    score = []      # rating score
    info_list = []  # short review text, '无' when absent
    for item in ol.find_all('li'):
        detail = item.find('div', attrs={'class': 'hd'})
        name.append(detail.find('span', attrs={'class': 'title'}).get_text())
        score.append(item.find('span', attrs={'class': 'rating_num'}).get_text())
        star = item.find('div', attrs={'class': 'star'})
        star_con.append(star.find(text=re.compile('评价')))
        info = item.find('span', attrs={'class': 'inq'})
        # Not every movie has a short review; record a placeholder so all
        # four lists stay the same length.
        info_list.append(info.get_text() if info else '无')
    # Fix: the "next" span may be absent entirely, in which case the
    # original chained .find('a') raised AttributeError; guard both steps.
    next_span = soup.find('span', attrs={'class': 'next'})
    page = next_span.find('a') if next_span else None
    if page:
        return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
    return name, star_con, score, info_list, None
def main():
    """Crawl every top250 page, then write all rows to the workbook.

    Columns: A = title, B = rating count, C = score, D = short review.
    Saves the workbook to *dest_filename* when done.
    """
    url = DOWNLOAD_URL
    name = []
    star_con = []
    score = []
    info = []
    # get_li returns the next page URL (None after the last page).
    while url:
        doc = download_page(url)
        movie, star, level_num, info_list, url = get_li(doc)
        name += movie
        star_con += star
        score += level_num
        info += info_list
    # Fix: the original computed the row as name.index(i), which returns
    # the FIRST occurrence — two movies with the same title would be
    # written to the same row, leaving other rows empty. enumerate gives
    # each record its true 1-based row number.
    for row, (i, m, o, p) in enumerate(zip(name, star_con, score, info), start=1):
        ws1['A%s' % row] = i
        ws1['B%s' % row] = m
        ws1['C%s' % row] = o
        ws1['D%s' % row] = p
    wb.save(filename=dest_filename)


if __name__ == '__main__':
    main()
结果如下:
职位信息存储在json中,获取到json对象,再从中遍历出公司名、地址、待遇等信息。
import requests
from openpyxl import Workbook
def get_json(url, page, lang_name):
    """POST one page of the Lagou position search and flatten the result.

    Each position in the JSON response becomes a 5-item row: short
    company name, full company name, salary, city, required education.
    Returns the list of rows for this page.
    """
    payload = {'first': 'true', 'pn': page, 'kd': lang_name}
    resp = requests.post(url, payload).json()
    positions = resp['content']['positionResult']['result']
    fields = ('companyShortName', 'companyName', 'salary', 'city', 'education')
    return [[pos[field] for field in fields] for pos in positions]
def main():
    """Ask for a job title, crawl 30 result pages, save rows to xlsx."""
    lang_name = input('职位名:')
    url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
    info_result = []
    # Pages 1..30, same range the original while-loop covered.
    for page in range(1, 31):
        info_result += get_json(url, page, lang_name)
    wb = Workbook()
    ws1 = wb.active
    ws1.title = lang_name
    for row in info_result:
        ws1.append(row)
    wb.save('职位信息.xlsx')


if __name__ == '__main__':
    main()
运行结果:
通过开发者工具,获取post的数据。
import requests,time
from bs4 import BeautifulSoup
def get_captcha(data):
    """Save the captcha image bytes to disk and ask the user to read it.

    Writes captcha.gif into the working directory, then returns whatever
    text the user types as the captcha answer.
    """
    with open('captcha.gif', 'wb') as img:
        img.write(data)
    return input('输入验证码:')
def login(username, password, oncaptcha):
    """Log in to Zhihu with an email account.

    *oncaptcha* is a callback that receives the captcha image bytes and
    returns the text the user read from it. Prints and returns the raw
    body of the login POST response.
    """
    session = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
    }
    # The sign-in page embeds a _xsrf token that must accompany the POST.
    signin_html = session.get('https://www.zhihu.com/#signin', headers=headers).content
    xsrf_input = BeautifulSoup(signin_html, 'html.parser').find('input', attrs={'name': '_xsrf'})
    _xsrf = xsrf_input.get('value')
    # Millisecond timestamp in the query string defeats captcha caching.
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=%d&type=login' % (time.time() * 1000)
    captcha_content = session.get(captcha_url, headers=headers).content
    data = {
        "_xsrf": _xsrf,
        "email": username,
        "password": password,
        "remember_me": True,
        "captcha": oncaptcha(captcha_content),
    }
    resp = session.post('https://www.zhihu.com/login/email', data, headers=headers).content
    print(resp)
    return resp


if __name__ == "__main__":
    login('your_email', 'your_password', get_captcha)
运行后会在运行目录下得到验证码图片:
输入验证码后得到如下响应结果表明登录成功。