今天看到一个有趣的东西,爬虫抓取拉勾网职位信息,特地实验了一番。
大体思路是这样的:
1、用 Chrome 开发者工具分析表单提交的 url 和表单数据(原文此处附有截图/链接,转载时已丢失)
2、模拟数据直接向url发起请求
3、返回的数据写入excel
"""Scrape job postings from Lagou's AJAX search endpoint into an Excel file.

Workflow (matching the surrounding article):
1. Inspect the form POST with the browser's dev tools.
2. Replay the request directly with ``requests``.
3. Write the returned rows into a .xlsx workbook via ``openpyxl``.
"""
import requests
from openpyxl import Workbook

# Fields extracted from each job posting, in output-column order.
_FIELDS = ('companyShortName', 'companyName', 'workYear',
           'salary', 'city', 'education')


def get_json(url, page, job_name):
    """POST the search form and return one row (list) per job posting.

    Parameters:
        url: the positionAjax.json endpoint.
        page: 1-based result page number.
        job_name: search keyword.

    NOTE(review): Lagou may reject requests lacking browser-like headers or
    cookies — confirm the endpoint still answers a plain POST.
    """
    # The endpoint expects a form-encoded body, so we pass data=form;
    # serializing with json.dumps changes the body and breaks the request
    # (the original author observed exactly this).
    form = {'first': 'false', 'pn': page, 'kd': job_name}
    resp = requests.post(url, data=form)
    jobs = resp.json()['content']['positionResult']['result']
    return [[job[field] for field in _FIELDS] for job in jobs]


def main():
    """Prompt for a keyword, fetch result pages 1-9, and save job.xlsx."""
    # Python 3: input() already returns str (unicode) — no encode/decode
    # round-trip needed as in the original Python 2 raw_input version.
    job_name = input("job:")
    url = "http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

    rows = []
    for page in range(1, 10):  # same range as the original while-loop: pages 1..9
        rows += get_json(url, page, job_name)

    wb = Workbook()
    ws = wb.active
    ws.title = job_name
    for row in rows:
        ws.append(row)
    wb.save("job.xlsx")


if __name__ == '__main__':
    main()
参考链接:python requests 库使用教程(原始链接在转载时丢失,可参阅 requests 官方文档)