本帖最后由 mmortalyi 于 2023-12-23 22:40 编辑
需要把 header 中的 xxxxx 替换为真实的用户名和密码,并填写 login_url(代码中默认为空字符串);注意不要爬取太快。仅供学习使用,不得用于商业用途!
已经兼容python3 需要安装bs4,安装方法 pip install bs4 # coding: utf-8 from __future__ import print_function __author__ = 'bobo' from math import ceil import sys py_version = sys.version_infoimport requests from BeautifulSoup import BeautifulSoup as BS if py_version < (3, 0): reload(sys) sys.setdefaultencoding("utf-8") home_url = 'http://www.qixin.com/search' login_url = "" get_params = { "area.city": 3303, "area.province": 33, "key": '', "page": 1, } header = { "acc": "xxxxxxxx", "pass": "xxxxxxx", "captcha": {"isTrusted": True}, "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3", "Connection": "keep-alive", "Host": "www.qixin.com", "Upgrade-Insecure-Requests": 1, "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:54.0) Gecko/20100101 Firefox/54.0", } def get_page_number(html, EVERY_PAGE=10, bbq=False): """ 获取总页数、计算出页数 :param html: :param EVERY_PAGE: :return: """ if bbq: html = BS(html) shangjia_numbers = html.find("em").text print(shangjia_numbers) print() try: # 为了方便计算页数 shangjia_numbers = float(shangjia_numbers) PAGE_NUMBER = int(ceil(shangjia_numbers / EVERY_PAGE)) except: print("int失败") PAGE_NUMBER = False return PAGE_NUMBER # 传递页数爬取数据 def get_company_info(html, status=u"存续", all=False, file_name=None, bbq=False): """ :param status: 企业状态 存续/注销 :param all: 是否查看所有的企业 :param file_name: 是否写入文件 :return: """ if bbq: html = BS(html) for tag_div in html.findAll(attrs={"class": "col-2-1"}): for span in tag_div.findAll("span"): # if status in span.text or all: if status in span.text: # print("公司存续") company_name = tag_div.find(attrs={"title": u"点击查看公司详情"}).text # print(company_name) for span_address in tag_div.findAll(attrs={"class": "legal-person"}): if u"地址" in span_address.text: company_address = span_address.text[3:][:-4] # print(company_address) if file_name: with open(file_name, 'a') as f: print(company_name, company_address) f.write("{0} 
{1}\n".format(company_name, company_address)) return def main(KEY_WORD=u"医美", file_name="address.txt", first_html=None): if not first_html: r = requests.post(login_url, data=header) _cookies = r.cookies get_params["key"] = KEY_WORD print(get_params) first_html = requests.get(home_url, params=get_params, cookies=_cookies) first_html = first_html.text print(repr(first_html)) # print(first_html) # sys.exit(0) first_html_content = BS(first_html) # print(first_html_content) get_company_info(first_html_content, file_name=file_name) pages = get_page_number(first_html_content) for page_number in range(2, pages + 1): get_params["page"] = page_number html_content = requests.get(home_url, params=get_params, cookies=_cookies) get_company_info(html_content, file_name=file_name) if __name__ == "__main__": # for word in [u"医美", u"整容", u"整形"]: # main(KEY_WORD=word) main()
|