大神论坛

找回密码
快速注册
查看: 209 | 回复: 0

[源码] 原创启信宝爬虫 附源码

主题

帖子

0

积分

初入江湖

UID
675
积分
0
精华
威望
0 点
违规
大神币
68 枚
注册时间
2023-10-14 10:52
发表于 2023-12-23 22:40
本帖最后由 mmortalyi 于 2023-12-23 22:40 编辑

使用前需要把代码中 header 里的 xxxxx 替换为真实的用户名和密码;请注意控制抓取频率,不要爬得太快。
本代码仅供学习交流使用,请勿用于商业用途!

已经兼容 Python 3。
需要安装 requests 和 bs4,安装方法:
pip install requests bs4

# coding: utf-8
from __future__ import print_function
__author__ = 'bobo'
from math import ceil
import sys
py_version = sys.version_infoimport requests
from BeautifulSoup import BeautifulSoup as BS
if py_version < (3, 0):
reload(sys)
sys.setdefaultencoding("utf-8")

# Search endpoint that every result page is requested from.
home_url = "http://www.qixin.com/search"

# Login endpoint that main() posts to before searching. It is empty in this
# listing and has to be filled in before the script can log in.
login_url = ""

# Query-string parameters for the search request. main() fills in "key" with
# the search keyword and advances "page" while walking the result pages.
# NOTE(review): the two area values look like administrative region codes --
# confirm against the site before changing them.
get_params = {
    "area.city": 3303,
    "area.province": 33,
    "key": "",
    "page": 1,
}


# Payload that main() sends as the form body of the login POST.
# It mixes the account fields with browser-style header values; replace the
# "xxxx" placeholders with a real account before running.
header = {
    # Account fields.
    "acc": "xxxxxxxx",
    "pass": "xxxxxxx",
    "captcha": {"isTrusted": True},

    # Browser-like request metadata.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
    "Connection": "keep-alive",
    "Host": "www.qixin.com",
    "Upgrade-Insecure-Requests": 1,
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) "
        "Gecko/20100101 Firefox/54.0"
    ),
}


def get_page_number(html, EVERY_PAGE=10, bbq=False):
    """Work out how many result pages a search produced.

    The total number of hits is read from the text of the first ``<em>``
    element of the page and divided by ``EVERY_PAGE``, rounding up.

    :param html: parsed document exposing ``find``; raw markup when ``bbq``
        is true.
    :param EVERY_PAGE: number of results shown on one page.
    :param bbq: when true, ``html`` is parsed with ``BS`` first.
    :return: the page count as an ``int``, or ``False`` when the hit count is
        missing or cannot be converted to a number.
    """
    if bbq:
        html = BS(html)

    count_tag = html.find("em")
    if count_tag is None:
        # No hit counter on the page; report it the same way as an
        # unparsable counter instead of raising AttributeError.
        print("int失败")
        return False

    shangjia_numbers = count_tag.text
    print(shangjia_numbers)
    print()
    try:
        # Go through float so that true division is used on Python 2 as well.
        return int(ceil(float(shangjia_numbers) / EVERY_PAGE))
    except (TypeError, ValueError, ZeroDivisionError, OverflowError):
        print("int失败")
        return False


# 传递页数爬取数据
def get_company_info(html, status=u"存续", all=False, file_name=None, bbq=False):
    """Extract company names and addresses from one search-result page.

    Every element with class ``col-2-1`` is treated as one company entry. An
    entry is kept when any of its ``<span>`` texts contains ``status`` (or
    unconditionally when ``all`` is true). For each kept entry, every
    ``legal-person`` element whose text contains "地址" yields one
    ``name address`` line.

    :param html: parsed document exposing ``findAll``; raw markup when
        ``bbq`` is true.
    :param status: company status to keep, e.g. active / deregistered.
    :param all: when true, keep every company regardless of ``status``.
    :param file_name: file to append the results to; nothing is printed or
        written when it is empty.
    :param bbq: when true, ``html`` is parsed with ``BS`` first.
    :return: None
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    include_all = all  # the parameter name shadows the builtin; alias it

    if bbq:
        html = BS(html)

    records = []
    for tag_div in html.findAll(attrs={"class": "col-2-1"}):
        # Decide once per company so that several matching spans do not
        # produce duplicate output lines.
        if not include_all and not any(
            status in span.text for span in tag_div.findAll("span")
        ):
            continue

        name_tag = tag_div.find(attrs={"title": u"点击查看公司详情"})
        if name_tag is None:
            # Entry without a detail link: there is no name to report.
            continue
        company_name = name_tag.text

        for span_address in tag_div.findAll(attrs={"class": "legal-person"}):
            if u"地址" in span_address.text:
                # Drops the first 3 and the last 4 characters.
                # NOTE(review): presumably the address label and a trailing
                # link caption -- confirm against the live markup.
                company_address = span_address.text[3:][:-4]
                records.append((company_name, company_address))

    if file_name and records:
        # Open once, with an explicit encoding, instead of once per record.
        with io.open(file_name, 'a', encoding='utf-8') as f:
            for company_name, company_address in records:
                print(company_name, company_address)
                f.write(u"{0} {1}\n".format(company_name, company_address))

    return


def main(KEY_WORD=u"医美", file_name="address.txt", first_html=None, page_delay=1.0):
    """Log in, run one keyword search and append every result page to a file.

    :param KEY_WORD: search keyword sent as the ``key`` query parameter.
    :param file_name: file that the ``name address`` lines are appended to.
    :param first_html: markup of the first result page; when given, the login
        and the first search request are skipped.
    :param page_delay: seconds to wait before each follow-up page request, so
        the site is not hit too quickly. Use 0 to disable.
    """
    # Local import keeps this block self-contained.
    import time

    # Work on a private copy so the page counter of one call cannot leak into
    # the next call through the module-level dict.
    params = dict(get_params)
    params["key"] = KEY_WORD
    params["page"] = 1

    # Stays None when the caller supplied the first page and no login is made.
    _cookies = None

    if not first_html:
        # NOTE(review): login_url is empty in this listing; requests rejects
        # an empty URL, so it must be filled in first.
        r = requests.post(login_url, data=header)
        _cookies = r.cookies

        print(params)
        first_html = requests.get(home_url, params=params, cookies=_cookies)
        first_html = first_html.text
        print(repr(first_html))

    first_html_content = BS(first_html)
    get_company_info(first_html_content, file_name=file_name)

    # get_page_number returns False when the hit count is unavailable; then
    # only the first page is processed.
    pages = get_page_number(first_html_content)
    if not pages:
        return

    for page_number in range(2, pages + 1):
        if page_delay and page_delay > 0:
            time.sleep(page_delay)
        params["page"] = page_number
        response = requests.get(home_url, params=params, cookies=_cookies)
        # Parse the body first: get_company_info needs a parsed document, not
        # the Response object.
        get_company_info(BS(response.text), file_name=file_name)


if __name__ == "__main__":
    # Run a single search with the default keyword.
    # To crawl several keywords in one run instead:
    # for word in [u"医美", u"整容", u"整形"]:
    #     main(KEY_WORD=word)
    main()


返回顶部