Python爬取微博数据
一、需求说明
抓取某一用户的新浪微博数据,抓取的内容包括:id号、微博发布的时间、正文(仅提取文字)、转发数、评论数、点赞数。
在返回的json数据结构中,我们需要的是以下字段:
- data
- cards
- mblog
- id #id号
- created_at # 发布时间
- text # 正文
- reposts_count # 转发数
- comments_count # 评论数
- attitudes_count # 点赞数
二、代码实现
确保安装了requests、pyquery和xlwt库。
- pip install pyquery
- pip install requests
- pip install xlwt
1)爬取微博数据写入weibo.txt文本中,具体代码解析:
- import requests
- from urllib.parse import urlencode
- from pyquery import PyQuery as pq
-
- host = 'm.weibo.cn'
- base_url = 'https://%s/api/container/getIndex?' % host
- user_agent = 'User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'
-
- headers = {
- 'Host': host,
- 'Referer': 'https://m.weibo.cn/u/XXX',
- 'User-Agent': user_agent
- }
-
-
- # 按页数抓取数据
- def get_single_page(page):
- params = {
- 'type': 'uid',
- 'value': XXX,
- 'containerid': YYY,
- 'page': page
- }
- url = base_url + urlencode(params)
- try:
- response = requests.get(url, headers=headers)
- if response.status_code == 200:
- return response.json()
- except requests.ConnectionError as e:
- print('抓取错误', e.args)
-
-
- # 解析页面返回的json数据
- def parse_page(json):
- items = json.get('data').get('cards')
- for item in items:
- item = item.get('mblog')
- if item:
- data = {
- 'id': item.get('id'),
- 'created_at':item.get('created_at'),
- 'text': pq(item.get("text")).text(), # 仅提取内容中的文本
- 'attitudes': item.get('attitudes_count'),
- 'comments': item.get('comments_count'),
- 'reposts': item.get('reposts_count')
- }
- yield data
-
-
- if __name__ == '__main__':
- fw = open('./weibo.txt','w',encoding='UTF-8') #写入txt文件中
- for page in range(1, 10): # 抓取前十页的数据
- json_r = get_single_page(page)
- results = parse_page(json_r)
- for result in results:
- print(result)
- fw.writelines(json.dumps((result),ensure_ascii=False))
- fw.writelines('\n')
- fw.close()
其中 value 与 containerid 的获取方法:进入所要爬取的微博号页面,右键 -> 查看网页源代码,在源代码中查找对应字段:
value = oid 对应值,containerid = page_id 对应值。
运行结果:
2)读取weibo.txt文本中数据写入weibo.xls中,具体代码解析:
- def getExcel():
- # 创建excel工作表
- workbook = xlwt.Workbook(encoding='utf-8')
- worksheet = workbook.add_sheet('Weibo')
-
- # 设置表头
- worksheet.write(0, 0, label='id')
- worksheet.write(0, 1, label='created_at')
- worksheet.write(0, 2, label='text')
- worksheet.write(0, 3, label='attitudes')
- worksheet.write(0, 4, label='comments')
- worksheet.write(0, 5, label='reposts')
-
- # 读取json文件
- data = []
- with open('./weibo.txt', 'r', encoding='UTF-8') as f:
- for i in f.readlines():
- data.append(json.loads(i))
- # 将json字典写入excel
- # 变量用来循环时控制写入单元格,感觉有更好的表达方式
- val = 1
- for list_item in data:
- for key, value in list_item.items():
- if key == "id":
- worksheet.write(val, 0, value)
- elif key == "created_at":
- worksheet.write(val, 1, value)
- elif key == "text":
- worksheet.write(val, 2, value)
- elif key == "attitudes":
- worksheet.write(val, 3, value)
- elif key == "comments":
- worksheet.write(val, 4, value)
- elif key == "reposts":
- worksheet.write(val, 5, value)
- val += 1
- # 保存
- workbook.save('./weibo.xls')
运行结果:
3)代码整合完整实现如下:
- # -*- coding: utf-8 -*-
- import xlwt
- import requests
- from urllib.parse import urlencode
- from pyquery import PyQuery as pq
- from openpyxl import Workbook,load_workbook
- import json
-
- host = 'm.weibo.cn'
- base_url = 'https://%s/api/container/getIndex?' % host
- user_agent = 'User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'
-
- headers = {
- 'Host': host,
- 'Referer': 'https://m.weibo.cn/u/XXX',
- 'User-Agent': user_agent
- }
-
- # 按页数抓取数据
- def get_single_page(page):
- params = {
- 'type': 'uid',
- 'value': XXX,
- 'containerid': YYY,
- 'page': page
- }
- url = base_url + urlencode(params)
- try:
- response = requests.get(url, headers=headers)
- if response.status_code == 200:
- return response.json()
- except requests.ConnectionError as e:
- print('抓取错误', e.args)
-
- # 解析页面返回的json数据
- def parse_page(json):
- items = json.get('data').get('cards')
- for item in items:
- item = item.get('mblog')
- if item:
- data = {
- 'id': item.get('id'),
- 'created_at':item.get('created_at'),
- 'text': pq(item.get("text")).text(), # 仅提取内容中的文本
- 'attitudes': item.get('attitudes_count'),
- 'comments': item.get('comments_count'),
- 'reposts': item.get('reposts_count')
- }
- yield data
-
- def getTXT():
- """写入txt文件中"""
- fw = open('./weibo.txt','w',encoding='UTF-8')
- for page in range(1, 10): # 抓取前十页的数据
- json_r = get_single_page(page)
- results = parse_page(json_r)
- for result in results:
- print(result)
- fw.writelines(json.dumps((result),ensure_ascii=False))
- fw.writelines('\n')
- fw.close()
-
- def getExcel():
- # 创建excel工作表
- workbook = xlwt.Workbook(encoding='utf-8')
- worksheet = workbook.add_sheet('罗小黑')
-
- # 设置表头
- worksheet.write(0, 0, label='id')
- worksheet.write(0, 1, label='created_at')
- worksheet.write(0, 2, label='text')
- worksheet.write(0, 3, label='attitudes')
- worksheet.write(0, 4, label='comments')
- worksheet.write(0, 5, label='reposts')
-
- # 读取json文件
- data = []
- with open('./weibo.txt', 'r', encoding='UTF-8') as f:
- for i in f.readlines():
- data.append(json.loads(i))
- # 将json字典写入excel
- # 变量用来循环时控制写入单元格,感觉有更好的表达方式
- val = 1
- for list_item in data:
- for key, value in list_item.items():
- if key == "id":
- worksheet.write(val, 0, value)
- elif key == "created_at":
- worksheet.write(val, 1, value)
- elif key == "text":
- worksheet.write(val, 2, value)
- elif key == "attitudes":
- worksheet.write(val, 3, value)
- elif key == "comments":
- worksheet.write(val, 4, value)
- elif key == "reposts":
- worksheet.write(val, 5, value)
- val += 1
- # 保存
- workbook.save('./weibo.xls')
-
- if __name__ == '__main__':
- getTXT()
- getExcel()