1.数据来源
基于BeautifulSoup对链家进行爬取,以昆明市房价为例,核心代码如下所示:
def pase_page(url, page):
    """Scrape one listing page and append its houses to '昆明房价.csv'.

    The page is fetched with ``craw(url, page)`` and parsed with
    BeautifulSoup.  Every ``.resblock-list-wrapper li`` element becomes one
    CSV row with eight fields: name, type, sale status, district, address,
    tags, average price and total price.  Rows are appended WITHOUT a
    header line, so anything reading the file back must not treat the
    first row as a header.

    Args:
        url: Passed through to ``craw`` unchanged.
        page: Page number; passed to ``craw`` and echoed in the status line.

    Returns:
        None.  Prints a success message after the page has been stored, or
        a failure message when ``craw`` returned ``None``.
    """
    html = craw(url, page)
    # Check for a failed fetch *before* stringifying: str(None) is the
    # non-None string 'None', which made the failure branch unreachable.
    # NOTE(review): assumes craw signals failure by returning None - confirm.
    if html is None:
        print('解析失败')
        return

    soup = BeautifulSoup(str(html), 'lxml')

    def text_of(node, selector, sep=' '):
        # Join the text of every element under `node` that matches `selector`.
        return sep.join(tag.get_text() for tag in node.select(selector))

    rows = [
        [
            text_of(house, '.resblock-name a.name'),                 # project name
            text_of(house, '.resblock-name span.resblock-type'),     # property type
            text_of(house, '.resblock-name span.sale-status'),       # sale status
            text_of(house, '.resblock-location span', sep=''),       # district (joined without spaces)
            text_of(house, '.resblock-location a'),                  # street address
            text_of(house, '.resblock-tag span'),                    # selling-point tags
            text_of(house, '.resblock-price .main-price .number'),   # average price per square metre
            text_of(house, '.resblock-price .second'),               # total price (unit: 10k CNY per the original note)
        ]
        for house in soup.select('.resblock-list-wrapper li')
    ]

    if rows:
        frame = pd.DataFrame(
            rows,
            columns=['名称', '类型', '销售状态', '大地址', '具体地址', '优势', '均价', '总价'],
        )
        # mode='a+' appends, so repeated calls accumulate pages in one file.
        frame.to_csv('昆明房价.csv', mode='a+', index=False, header=False)
    # NOTE(review): the source's indentation was lost; the message is
    # emitted once per page here, which matches its wording.
    print('第{0}页存储数据成功'.format(page))
2.数据分析
数据分析代码如下,并对分析结果进行了可视化展示:
import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import matplotlib.font_manager as fm
def read_data(path='昆明房价.csv'):
    """Load the scraped listings and remove unusable rows.

    The CSV is expected to have NO header line and eight columns in the
    order name, type, sale status, district, address, tags, unit price,
    total price.  It is therefore read with ``header=None``; letting pandas
    infer a header would silently consume the first listing as column
    names.

    Cleaning steps, each followed by a printed row count:
      1. the '总价' (total price) column is discarded;
      2. rows with any missing value are dropped and the index is reset
         (the old index is kept as an 'index' column, as before);
      3. rows whose '单价' is not purely digits AND contains '-' (price
         ranges such as '8000-9000') are dropped.  Other non-numeric
         values such as '价格待定' are kept.

    Args:
        path: CSV file to read.  Defaults to the file the scraper writes.

    Returns:
        pandas.DataFrame with the cleaned listings.
    """
    columns = ['楼盘', '类型', '销售状态', '大地址', '具体地址', '优势', '单价', '总价']
    data = pd.read_csv(path, encoding='utf-8', header=None, names=columns)
    data.pop('总价')
    print('原始数据量:', len(data))

    # Missing-value handling.
    data = data.dropna().reset_index()
    print('缺失值处理后的数据量:', len(data))

    # Invalid-price handling, vectorised instead of dropping row by row.
    price_text = data['单价'].astype(str)
    is_price_range = ~price_text.str.isdigit() & price_text.str.contains('-', regex=False)
    data = data[~is_price_range]
    print('无效数据处理后的数据量:', len(data))
    return data
def draw_bar(x, y, x_label, y_label, title):
    """Draw a bar chart, save it as '<title>.jpg', then show it.

    Args:
        x: Category labels, one per bar (used as tick labels).
        y: Bar heights, in the same order as ``x``.
        x_label: Caption for the x axis.
        y_label: Caption for the y axis.
        title: Chart title; also the stem of the saved image file name.
    """
    # Font settings so that Chinese text and the minus sign render.
    plt.rcParams.update({
        'font.sans-serif': ['SimHei'],
        'axes.unicode_minus': False,
    })

    # Bars sit at 0..len(x)-1; the labels are attached to those positions.
    positions = list(range(len(x)))
    plt.bar(positions, y)
    plt.ylabel(y_label)
    plt.xlabel(x_label)
    plt.xticks(positions, x)
    plt.title(title)

    plt.savefig(title + '.jpg')
    plt.show()
# 获取前5单价的楼盘柱状图
def get_five_unitPrice(data):
    """Plot the (up to) five listings with the highest unit price.

    Rows whose '单价' equals '价格待定' (price not yet announced) are
    ignored; the remaining prices are converted to int and sorted in
    descending order.  Using ``head(5)`` rather than indexing rows 0..4
    means a data set with fewer than five priced listings is charted as-is
    instead of raising ``KeyError``.

    Args:
        data: DataFrame with at least the columns '楼盘' and '单价'.

    Raises:
        ValueError: if a remaining '单价' value cannot be converted to int.
    """
    priced = data.drop(data[data['单价'] == '价格待定'].index)
    # Convert the price column from text to integers before sorting.
    priced['单价'] = priced['单价'].astype(int)
    top = priced.sort_values(by='单价', ascending=False).head(5)

    x = top['楼盘'].tolist()
    y = [int(price) for price in top['单价']]
    print(x)
    print(y)
    draw_bar(x, y, '楼盘', '单价', '单价最高的5个楼盘')
# 获取销售状态的饼状图
def get_bar(data):
    """Draw a pie chart of listings per sale status and save it as a JPG.

    Counts the rows whose '销售状态' is exactly '在售', '待售' or '售罄',
    prints the counts and the labels, then renders, saves and shows the pie.

    Args:
        data: DataFrame with a '销售状态' column.
    """
    # Font settings so that Chinese text and the minus sign render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    labels = ['在售', '待售', '售罄']
    status = data['销售状态']
    sizes = [len(data[status == label]) for label in labels]
    print(sizes)
    print(labels)

    plt.pie(
        sizes,
        labels=labels,        # wedge labels
        autopct='%1.1f%%',    # show each share as a percentage with one decimal
    )
    plt.title("销售状态百分比饼图")
    plt.savefig('销售状态百分比饼图.jpg')
    plt.show()
# 获取不同类型房屋的数量柱状图
def get_five_typeNum(data):
    """Print and chart the number of listings for five property types.

    The bars are 住宅, 写字楼, 别墅, 商业类 and 底商.  The 商业类 bar is the
    sum of the rows typed '商业' and the rows typed '商业类'.  The type
    '酒店式公寓' is not counted.

    Args:
        data: DataFrame with a '类型' column.
    """
    # (bar label, raw '类型' values that contribute to that bar)
    groups = [
        ('住宅', ('住宅',)),
        ('写字楼', ('写字楼',)),
        ('别墅', ('别墅',)),
        ('商业类', ('商业', '商业类')),
        ('底商', ('底商',)),
    ]

    house_type = data['类型']
    x = [label for label, _ in groups]
    y = [
        sum(len(data[house_type == member]) for member in members)
        for _, members in groups
    ]

    print('各个类型楼盘数量分别为:')
    for label, count in zip(x, y):
        print(label, count)
    draw_bar(x, y, '类型', '数量', '不同类型数量柱状图')
# 优势字段词频统计
def count_youshi(data):
    """Print and chart the ten most frequent tags in the '优势' column.

    Each cell is split on single spaces and the resulting tags are counted
    across all rows.  Empty tokens (produced by consecutive, leading or
    trailing spaces) are skipped so that the empty string can never show up
    as a "word".  Ties keep first-seen order.

    Args:
        data: DataFrame whose '优势' column holds space-separated strings.
    """
    # Local import: the module's import block does not bring in collections.
    from collections import Counter

    counts = Counter(
        word
        for text in data['优势']
        for word in text.split(' ')
        if word
    )
    top = counts.most_common(10)
    x = [word for word, _ in top]
    y = [count for _, count in top]

    print('优势字段词频统计top10分别为:')
    for word, count in top:
        print(word, count)
    draw_bar(x, y, '优势词', '词频', '优势词词频柱状图')
# 统计不同类型房屋平均单价
def get_type_mean_unitPrice(data):
    """Print and chart the mean unit price for each property type.

    Rows whose '单价' equals '价格待定' are excluded and the remaining
    prices are converted to int.  The raw type '商业' is folded into
    '商业类' BEFORE averaging, so that bar is the true mean over all
    commercial listings rather than the mean of two group means.

    Group means are looked up by label, not by integer position.
    Positional lookup only worked when exactly the seven expected types
    were present in sorted order, and integer keys on a label-indexed
    Series are deprecated in pandas.  Types absent from the data are left
    out of the chart.

    Args:
        data: DataFrame with at least the columns '类型' and '单价'.

    Raises:
        ValueError: if a remaining '单价' value cannot be converted to int.
    """
    # Exclude listings whose price has not been announced.
    priced = data.drop(data[data['单价'] == '价格待定'].index)
    priced['单价'] = priced['单价'].astype(int)

    house_type = priced['类型'].replace('商业', '商业类')
    mean_data = priced['单价'].groupby(house_type).mean()
    print(mean_data)

    wanted = ['住宅', '写字楼', '别墅', '商业类', '底商', '酒店式公寓']
    x = [name for name in wanted if name in mean_data.index]
    y = [mean_data[name] for name in x]
    print(x)
    print(y)
    draw_bar(x, y, '类型', '平均单价', '不同类型平均单价柱状图')
# 最大最小值分析
def getMaxMin(data):
    """Print the listing with the highest and the lowest unit price.

    Rows whose '单价' equals '价格待定' are ignored, the remaining prices
    are converted to int, and rows priced below 1000 are discarded before
    the extremes are located.

    Args:
        data: DataFrame with at least the columns '楼盘' and '单价'.
    """
    undecided = data[data['单价'] == '价格待定'].index
    priced = data.drop(undecided)
    priced['单价'] = priced['单价'].astype(int)

    # NOTE(review): prices under 1000 are presumably treated as invalid - confirm.
    too_low = priced[priced['单价'] < 1000].index
    priced = priced.drop(too_low)

    unit_price = priced['单价']
    highest = unit_price.idxmax()
    lowest = unit_price.idxmin()
    print('单价最高的楼盘为:\n', priced.at[highest, '楼盘'], priced.at[highest, '单价'])
    print('单价最低的楼盘为:\n', priced.at[lowest, '楼盘'], priced.at[lowest, '单价'])
# 基于透视表的分析
def pivotTable(data):
    """Show and print the mean unit price per (type, sale status) pair.

    Rows whose '单价' equals '价格待定' are ignored, prices are converted
    to int, and rows priced below 1000 are discarded.  The remaining rows
    are pivoted on ['类型', '销售状态'] with the mean of '单价', drawn as a
    horizontal bar chart, and finally printed.

    The aggregation is given as the string 'mean' instead of ``np.mean``:
    pandas already dispatches that callable to its own mean and warns that
    passing the callable is deprecated, so the result is unchanged.

    Args:
        data: DataFrame with the columns '类型', '销售状态' and '单价'.
    """
    # Font settings so that Chinese text and the minus sign render.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    priced = data.drop(data[data['单价'] == '价格待定'].index)
    priced['单价'] = priced['单价'].astype(int)
    priced = priced.drop(priced[priced['单价'] < 1000].index)

    # Pivot table: one row per (type, sale status), value = mean unit price.
    data_pivot = pd.pivot_table(
        priced,
        values='单价',
        index=['类型', '销售状态'],
        aggfunc='mean',
    )
    data_pivot.plot.barh(figsize=(10, 7))
    plt.show()
    print(data_pivot)
# 绘制优势字段词云图
def drawWordcloudImg(data, font_path='C:/Windows/Fonts/simkai.ttf'):
    """Build, show and save a word cloud of the '优势' column.

    Every cell is segmented with jieba in precise mode (``cut_all=False``),
    single-space tokens are discarded, and the remaining tokens are joined
    with spaces and handed to ``WordCloud.generate``.  The image is shown
    with matplotlib and then written to '优势词云图.png'.

    Args:
        data: DataFrame whose '优势' column holds strings.
        font_path: Font file used to render the words.  The default is the
            Windows KaiTi font the script originally hard-coded; pass
            another path on systems where that file does not exist.
    """
    stopwords = {' '}  # tokens to drop after segmentation
    words = [
        token
        for text in data['优势']
        for token in jieba.cut(text, cut_all=False)
        if token not in stopwords
    ]
    # WordCloud.generate expects a single string, not a list of tokens.
    text = ' '.join(words)

    wordcloud = WordCloud(
        background_color="white",  # canvas colour
        max_words=150,             # upper bound on words drawn
        max_font_size=60,          # largest font size used
        random_state=42,           # fixed seed
        font_path=font_path,       # a font with Chinese glyphs is needed
    ).generate(text)

    # Draw the image without axes, show it, then save it to disk.
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('优势词云图.png')
if __name__ == '__main__':
    # Interactive menu: load the data once, then run the chosen analysis
    # repeatedly until the user quits.
    data = read_data()

    # Menu number -> analysis function; each one takes the DataFrame.
    actions = {
        '1': get_five_unitPrice,
        '2': get_five_typeNum,
        '3': get_bar,
        '4': count_youshi,
        '5': get_type_mean_unitPrice,
        '6': getMaxMin,
        '7': pivotTable,
        '8': drawWordcloudImg,
    }
    # The prompt now advertises option 0; the loop previously had no exit.
    prompt = (
        '请选择数据分析功能序号(1 前5单价的楼盘柱状图 2 不同类型房屋的数量柱状图 3 销售状态的饼状图 '
        '4 优势字段词频前10展示 5 统计不同类型房屋平均单价 6 房价最高和最低的楼盘 '
        '7 透视表分析不同销售状态楼盘情况 8 优势词云图 0 退出)\n'
    )

    while True:
        try:
            select_num = input(prompt).strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave the menu without a traceback.
            break
        if select_num == '0':
            break
        action = actions.get(select_num)
        if action is None:
            print('输入错误!!!')
        else:
            action(data)
3.可视化展示