Python爬取51Job招聘信息

Python爬取51Job招聘信息

1、准备工作

分析51job招聘信息页面,如图所示,红框标记的是页码。将URL以页码为界分割为前后两个部分,爬取多个页面时只需要修改页码信息。

分析请求响应信息,请求时需要模拟浏览器请求信息,如下图,包括header信息和cookie信息;响应结果位于页面的<script>标签中(赋值给 window.__SEARCH_RESULT__ 的JSON),需要通过正则表达式提取后再解析。

2、用到的Python库

requests:模拟浏览器请求

re:正则表达式

json:字符串转json

pandas:表格数据处理,并将结果导出为CSV文件

3、代码

import json
import re
import time

import requests
import pandas as pd

# Matches the JSON object that the search page assigns to
# ``window.__SEARCH_RESULT__`` inside a <script> tag.
_SEARCH_RESULT_RE = re.compile(r'window\.__SEARCH_RESULT__ = (.*?)</script>', re.S)

# CSV column order; the file is written without a header row.
_COLUMNS = ["工作名称", "工资待遇", "工作地点", "职位要求", "公司名称",
            "公司规模", "公司类别", "公司福利", "主营业务", "发布日期"]


def _extract_job_list(html):
    """Return the ``engine_jds`` list embedded in *html*, or None if absent."""
    match = _SEARCH_RESULT_RE.search(html)
    if match is None:
        return None
    try:
        payload = json.loads(match.group(1))
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get('engine_jds')


def _job_to_row(job):
    """Map one ``engine_jds`` entry to a CSV row keyed by column title."""
    return {
        "工作名称": job['job_name'],
        "工资待遇": job['providesalary_text'],
        "工作地点": job['workarea_text'],
        # The first attribute entry is skipped, exactly as the original did.
        "职位要求": ' '.join(job['attribute_text'][1:]),
        "公司名称": job['company_name'],
        "公司规模": job['companysize_text'],
        "公司类别": job['companytype_text'],
        "公司福利": job['jobwelf'],
        "主营业务": job['companyind_text'],
        "发布日期": job['issuedate'],
    }


def get_data(pre_url, suf_url, headers, cookies, page_num,
             out_path="51Job乌鲁木齐招聘信息.csv", delay=1, session=None):
    """Fetch 51job search result pages and append the postings to a CSV file.

    Each page URL is built as ``pre_url + str(page) + suf_url``. The postings
    are read from the JSON assigned to ``window.__SEARCH_RESULT__`` in the
    page source and appended to ``out_path`` without a header row or index.

    Args:
        pre_url: URL prefix (everything before the page number).
        suf_url: URL suffix (everything after the page number).
        headers: Request headers passed to the HTTP ``get`` call.
        cookies: Cookies passed to the HTTP ``get`` call.
        page_num: Exclusive upper bound of the page range; pages
            ``1 .. page_num - 1`` are fetched (pass 51 to fetch 50 pages).
        out_path: CSV file that rows are appended to.
        delay: Seconds to sleep after each page.
        session: Optional object exposing a requests-compatible
            ``get(url, headers=..., cookies=...)``; defaults to the
            ``requests`` module itself.

    Returns:
        None. Results are written to ``out_path`` and progress is printed.
    """
    http = requests if session is None else session
    for page in range(1, page_num):
        print("爬取第" + str(page) + "页数据")
        url = pre_url + str(page) + suf_url
        web = http.get(url, headers=headers, cookies=cookies)
        web.encoding = 'gbk'
        print(web.text)

        job_list = _extract_job_list(web.text)
        if job_list is None:
            # A page without the embedded result previously crashed in
            # json.loads before the "no data" message could ever be printed.
            print("跳转网页,无数据")
        else:
            data = pd.DataFrame([_job_to_row(job) for job in job_list],
                                columns=_COLUMNS)
            print(data)
            try:
                data.to_csv(out_path, mode="a+", header=False, index=False,
                            encoding="utf-8")
            except OSError as exc:
                # Keep crawling on a write failure, but report the real cause.
                print("写入文件失败: " + str(exc))
        time.sleep(delay)


if __name__ == '__main__':
    # The search URL is split around the page number; get_data() inserts the
    # page number between the two halves.
    pre_url = "https://search.51job.com/list/310200,000000,0000,00,9,99,+,2,"
    suf_url = ".html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
    # Browser-like request headers.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # NOTE(review): advertising 'br' allows a Brotli-compressed reply, which
        # requests can only decode when a brotli package is installed — confirm
        # the environment has one, or drop 'br'.
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'close',
        'Referer': 'https://search.51job.com/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    }
    # Raw ``Cookie`` request-header value.
    # NOTE(review): presumably copied from a browser session and likely to
    # expire — refresh it if the site starts returning pages without data.
    raw_cookie = "_uab_collina=164515157588672382024854; guid=ffafb018452895c75b5ff63cd2fb9563; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; slife=lastvisit%3D310200%26%7C%26; privacy=1646033921; search=jobarea%7E%60310200%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60310200%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA-java%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21collapse_expansion%7E%601%7C%21; acw_tc=76b20fe516460406333445179e1863f701fac22d0d83677b10737d6e54a7ab; acw_sc__v2=621c9639e08991e176e0cadd6a8c5f5ea4dabb36; ssxmod_itna=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnmx0v+xPGzDAxn40iDtrO59hfuDq1YGAPrF3YpSa2tX74aRDb6G4W2D3DU4i8DCL2F4WDemtD5xGoDPxDeDA7KiTDY4DdXxYPG0DiKDpx0kG25D7ZF41lKDTPYDRgaGDQyk9gPmx407DiHq920kD75pDlpxIRYD018f1Av1GRG=qlDDUmR60n2bMbb5xqi36m9Gq40OD0FGXxibG6g6Rav14w+e6QxPDaDPKlbq3iDowDrP=QIxmni5bGiQtYxTmlGx=WKKYmrKDDp4Z4PWGD4D==; ssxmod_itna2=Yq0xcDyD2A0QG=qD=DXm3fSosD78beb1Dg0i=nhnDnKd2qDsKoDLGYhnav7bAi3Fw8MYYm7=w3ifeEjpMIeq8=EdxnRI=ayekyb=8lFkEnXmpQKjjLW=z/=ORRGLUKMRowcU3gBbUOXpQBUZKAKDq33ZUSi4OMi47CE5IjIhQf2M7l03ulie7eiK7CYP3b0k3lfr2CXFoLwZYQpaNWhk57XKYh9hK0G5i7j187j9tQ61U+eZtl3o=3n9lCI88kow8U=4a+0KyOBx0fGD5BOG1KZfNVSxC69Kwz3DkQM79FxD=3zinLzb=QZ16Kjmhx0YPw3ONv1=5Avpa6phdYz6iPvLB3=1pU1Q56d5FwG05=pEhWbiR5TKRQxYW23FUvPzNAPb9Q=8/=0YLjBUa60ezpEErjiTop2DhW==Qtb0bgnQEuv+nFz6r2uS=e9CbT2TUzEWIAHguA82m/=4=RTKnSc2I9uR4uffo9WXZoTv+dFDG2Sr3SG4IRt4FA0htBx4Y2sqQEYl=R2HbxDFqD+oLNQG5D4PEDqBpKnhn0G5AmKAdCEjryxxD==="
    # requests expects ``cookies`` to map each cookie NAME to its value.
    # Passing the whole header under a single "Cookie" key sent a cookie
    # literally named "Cookie" and mangled the first real cookie, so split
    # the header into pairs. Values may themselves contain "=", hence
    # partition on the first one only.
    cookies = {}
    for pair in raw_cookie.split(";"):
        name, sep, value = pair.strip().partition("=")
        if sep:
            cookies[name] = value
    # get_data() fetches pages 1 .. 50 for an upper bound of 51.
    get_data(pre_url, suf_url, headers, cookies, 51)

4、爬取结果

两个鬼故事金姓起名男孩名子坏蛋是怎样炼成www.duote.com阴阳师姑获鸟御魂百度足球适合美甲店起的名字辽宁号排水量筋头巴脑饭店起什么名《蜜桃成熟时33d》yy马甲格式13画的起名吉利字会务公司起名601688资金流向安字起名大全男孩给钻石婚戒起名字从诗经起名字高云娇乒乓球简历恒字起名子男孩李姓起名字给海参店起名字大全首饰店起名吉檀迦利徐国翀隐身衣起名洪宇游戏起名网站莒县起名字哪里好石油科技有限公司起名属鸡起名男孩妖精的尾巴剧场版少年生前被连续抽血16次?多部门介入两大学生合买彩票中奖一人不认账让美丽中国“从细节出发”淀粉肠小王子日销售额涨超10倍高中生被打伤下体休学 邯郸通报单亲妈妈陷入热恋 14岁儿子报警何赛飞追着代拍打雅江山火三名扑火人员牺牲系谣言张家界的山上“长”满了韩国人?男孩8年未见母亲被告知被遗忘中国拥有亿元资产的家庭达13.3万户19岁小伙救下5人后溺亡 多方发声315晚会后胖东来又人满为患了张立群任西安交通大学校长“重生之我在北大当嫡校长”男子被猫抓伤后确诊“猫抓病”测试车高速逃费 小米:已补缴周杰伦一审败诉网易网友洛杉矶偶遇贾玲今日春分倪萍分享减重40斤方法七年后宇文玥被薅头发捞上岸许家印被限制高消费萧美琴窜访捷克 外交部回应联合利华开始重组专访95后高颜值猪保姆胖东来员工每周单休无小长假男子被流浪猫绊倒 投喂者赔24万小米汽车超级工厂正式揭幕黑马情侣提车了西双版纳热带植物园回应蜉蝣大爆发当地回应沈阳致3死车祸车主疑毒驾恒大被罚41.75亿到底怎么缴妈妈回应孩子在校撞护栏坠楼外国人感慨凌晨的中国很安全杨倩无缘巴黎奥运校方回应护栏损坏小学生课间坠楼房客欠租失踪 房东直发愁专家建议不必谈骨泥色变王树国卸任西安交大校长 师生送别手机成瘾是影响睡眠质量重要因素国产伟哥去年销售近13亿阿根廷将发行1万与2万面值的纸币兔狲“狲大娘”因病死亡遭遇山火的松茸之乡“开封王婆”爆火:促成四五十对奥巴马现身唐宁街 黑色着装引猜测考生莫言也上北大硕士复试名单了德国打算提及普京时仅用姓名天水麻辣烫把捣辣椒大爷累坏了

两个鬼故事 XML地图 TXT地图 虚拟主机 SEO 网站制作 网站优化