本帖最后由 along 于 2021-09-19 19:37 编辑
# Part 1: scraper — collect new-home listings for Guangzhou from the Beike
# site (gz.fang.ke.com) and save one CSV per district.
# Written for jupyter notebook; data source: Beike Guangzhou new homes.
import random
import requests
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd

# Pool of desktop user-agent strings to rotate through.
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

# URL path segment on the site for each Guangzhou district.
# NOTE(review): 'huadou' looks like a typo for 'huadu' (花都区) — confirm
# against the live site before trusting that district's data.
region = {
    'nansha': 'nansha/',
    'liwan': 'liwan/',
    'yuexiu': 'yuexiu/',
    'haizhu': 'haizhu/',
    'tianhe': 'tianhe/',
    'baiyun': 'baiyun/',
    'huangpu': 'huangpugz/',
    'panyu': 'panyu/',
    'huadou': 'huadou/',
    'zengcheng': 'zengcheng/',
    'conghua': 'conghua/'
}

# One user agent chosen at random for this run.
headers = {"User-Agent": random.choice(USER_AGENTS)}


def _parse_listing(house_elem):
    """Extract (name, unit price, total price, vr flag) from one listing node."""
    # Price per square metre; '0' when the element is missing.
    price_tag = house_elem.find('span', class_="number")
    try:
        price = price_tag.text.strip()
    except Exception:
        price = '0'
    # VR flag: 1 when the VR icon is present, 0 otherwise.
    # Bug fix: the original only assigned 1 when the icon's text was empty,
    # so a non-empty icon left `have_vr` holding the previous listing's value.
    vr_tag = house_elem.find('li', class_="icon vr vr-animation-forever")
    have_vr = 1 if vr_tag is not None else 0
    # Total price; strip the Chinese label and unit suffixes.
    total_tag = house_elem.find('div', class_="second")
    try:
        total = total_tag.text.strip().replace(u'总价', '')
        total = total.replace(u'/套起', '').replace('(万/套)', '')
    except Exception:
        total = '0'
    # Development (楼盘) name with embedded newlines removed.
    loupan = house_elem.find('a', class_='name').text.replace("\n", "")
    return loupan, price, total, have_vr


def _scrape_region(key, path, main_url):
    """Scrape every result page for one district and save it as <key>.csv."""
    all_list = []
    response = requests.get(main_url + path, timeout=10, headers=headers)
    soup = BeautifulSoup(response.content, "lxml")
    # The first page shows the total listing count; 10 results per page,
    # rounded as in the original.
    page = round(int(soup.find('span', class_="value").string) / 10)
    for i in range(1, page + 1):
        sleep(1.7)  # throttle requests to avoid hammering the site
        page_url = main_url + path + f'pg{i}'
        response = requests.get(page_url, timeout=10, headers=headers)
        soup = BeautifulSoup(response.content, "lxml")
        # Pages past ~50 come back without data; stop at the first failure.
        try:
            house_elements = soup.find_all(
                'li', class_="resblock-list post_ulog_exposure_scroll has-results")
            for house_elem in house_elements:
                all_list.append(_parse_listing(house_elem))
        except Exception:
            break
    # Persist via pandas; utf_8_sig keeps Excel happy with Chinese text.
    df = pd.DataFrame(all_list)
    df.to_csv(f"{key}.csv", index=False, encoding="utf_8_sig")
    print(f"{key}.csv保存完毕")


def spider(regions):
    """Scrape one Guangzhou district (regions = its pinyin key) or all of
    them (regions = 'all'). Writes one CSV file per scraped district."""
    main_url = 'https://gz.fang.ke.com/loupan/'
    for key, path in region.items():
        # The two original branches (single district vs. 'all') were
        # byte-for-byte duplicates; both now share _scrape_region.
        if regions == key or regions == 'all':
            _scrape_region(key, path, main_url)


spider('all')
第二部分:柱状图
# Part 2: bar chart — compare VR-viewing support across districts.
import pandas as pd
from pyecharts.globals import CurrentConfig, NotebookType
from pyecharts import options as opts
import os
from pyecharts.charts import Bar

# Render inside Jupyter and load the chart assets from the pyecharts CDN.
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
CurrentConfig.ONLINE_HOST = "https://assets.pyecharts.org/assets/"

# Bug fix: os.listdir() returns files in arbitrary order, but the counts are
# paired positionally with the hard-coded labels below (which follow
# alphabetical pinyin order) — sort to keep them aligned.
file_list = sorted(os.listdir())
dicts = {}
for file in file_list:
    if file.endswith("csv"):
        df = pd.read_csv(file, encoding='utf-8')
        # Column 3 holds the 0/1 VR flag written by the scraper; summing the
        # matches replaces the original row-by-row counting loop.
        dicts[file.replace(".csv", "")] = int((df.iloc[:, 3] == 1).sum())

# District labels in the same (alphabetical pinyin) order as the sorted CSVs.
the_a = ["白云", "从化", "海珠", "花都", "黄埔", "荔湾", "南沙", "番禺", "天河", "越秀", "增城"]
value = [int(v) for v in dicts.values()]

# Build the bar chart: x = districts, y = VR-capable listing counts.
bar = Bar()
bar.add_xaxis(the_a)
bar.add_yaxis("各区支持VR", value)
bar.set_global_opts(title_opts=opts.TitleOpts(title="广州各区VR看房情况"))
bar.render_notebook()
# Part 3: pie chart — average listing price (yuan/m²) per district.
# (Relies on `the_a` and `opts` from the bar-chart cell above.)
import os
from pyecharts.charts import Pie

# Bug fix: sort so the averages stay aligned with the district labels
# (os.listdir() order is arbitrary).
file_list = sorted(os.listdir())
avg_count = []
for file in file_list:
    if file.endswith("csv"):
        df = pd.read_csv(file, encoding='utf-8')
        # Keep only rows whose price field is a plain integer string,
        # discarding placeholder/non-numeric entries.
        prices = [int(row[1]) for row in df.values.tolist() if str(row[1]).isdigit()]
        # Bug fix: guard against a district file with no numeric prices,
        # which previously raised ZeroDivisionError.
        avg_count.append(round(sum(prices) / len(prices)) if prices else 0)

pie = Pie(init_opts=opts.InitOpts(width="600px", height="400px"))
# Pair each district label with its average price.
pie.add("", data_pair=[(i, j) for i, j in zip(the_a, avg_count)])
pie.set_global_opts(title_opts=opts.TitleOpts(title="广州各区域房价均值分布(元/㎡)"))
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
pie.render_notebook()
# Part 4: district map — distribution of listing counts across Guangzhou.
import os
from pyecharts.charts import Map
import pandas as pd
from pyecharts import options as opts

# Bug fix: sort so the counts stay aligned with the district labels below
# (os.listdir() order is arbitrary).
file_list = sorted(os.listdir())
num_count = []
for file in file_list:
    if file.endswith("csv"):
        # Row count of each CSV = number of listings in that district.
        num_count.append(pd.read_csv(file, encoding='utf-8').shape[0])

# Full district names (matching pyecharts' Guangzhou map regions), in the
# same alphabetical pinyin order as the sorted CSV files.
the_a = ["白云区", "从化区", "海珠区", "花都区", "黄埔区", "荔湾区", "南沙区", "番禺区", "天河区", "越秀区", "增城区"]

# Visualize the listing distribution on a map of Guangzhou.
maps = (
    Map()
    .add("", [list(z) for z in zip(the_a, num_count)], maptype='广州')
    .set_global_opts(
        visualmap_opts=opts.VisualMapOpts(max_=200, is_piecewise=True),
        title_opts=opts.TitleOpts(title="广州各区房源数量分布"),
    )
)
maps.render_notebook()
首次用 jupyter notebook 写分区代码块,跟 pycharm 的开发方式不同,但对于可视化来讲还是很方便的。 文章仅供交流,如有侵权请联系删帖
|