2022年 11月 6日

Python爬取微博数据

Python爬取微博数据

一、需求说明

       抓取某一用户的新浪微博数据,抓取的内容包括:id号、微博发布的时间、正文(仅提取文字)、转发数、评论数、点赞数。

在返回的json数据结构中,我们需要的是以下字段:

  1. data
  2. cards
  3. mblog
  4. id #id号
  5. created_at # 发布时间
  6. text # 正文
  7. reposts_count # 转发数
  8. comments_count # 评论数
  9. attitudes_count # 点赞数

二、代码实现

        确保安装了requests、pyquery和xlwt库(xlwt用于后面将数据写入Excel)。

  1. pip install pyquery
  2. pip install requests
  3. pip install xlwt

       1)爬取微博数据写入weibo.txt文本中,具体代码解析:

  1. import requests
  2. from urllib.parse import urlencode
  3. from pyquery import PyQuery as pq
  4. host = 'm.weibo.cn'
  5. base_url = 'https://%s/api/container/getIndex?' % host
  6. user_agent = 'User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'
  7. headers = {
  8. 'Host': host,
  9. 'Referer': 'https://m.weibo.cn/u/XXX',
  10. 'User-Agent': user_agent
  11. }
  12. # 按页数抓取数据
  13. def get_single_page(page):
  14. params = {
  15. 'type': 'uid',
  16. 'value': XXX,
  17. 'containerid': YYY,
  18. 'page': page
  19. }
  20. url = base_url + urlencode(params)
  21. try:
  22. response = requests.get(url, headers=headers)
  23. if response.status_code == 200:
  24. return response.json()
  25. except requests.ConnectionError as e:
  26. print('抓取错误', e.args)
  27. # 解析页面返回的json数据
  28. def parse_page(json):
  29. items = json.get('data').get('cards')
  30. for item in items:
  31. item = item.get('mblog')
  32. if item:
  33. data = {
  34. 'id': item.get('id'),
  35. 'created_at':item.get('created_at'),
  36. 'text': pq(item.get("text")).text(), # 仅提取内容中的文本
  37. 'attitudes': item.get('attitudes_count'),
  38. 'comments': item.get('comments_count'),
  39. 'reposts': item.get('reposts_count')
  40. }
  41. yield data
  42. if __name__ == '__main__':
  43. fw = open('./weibo.txt','w',encoding='UTF-8') #写入txt文件中
  44. for page in range(1, 10): # 抓取前十页的数据
  45. json_r = get_single_page(page)
  46. results = parse_page(json_r)
  47. for result in results:
  48. print(result)
  49. fw.writelines(json.dumps((result),ensure_ascii=False))
  50. fw.writelines('\n')
  51. fw.close()

   其中value与containerid的取值方法:进入需要爬取的微博用户主页,右键->查看网页源代码,在源代码中找到对应的数据。

  

  value = oid对应值,containerid = page_id对应值。

运行结果:

        2)读取weibo.txt文本中数据写入weibo.xls中,具体代码解析:

  1. def getExcel():
  2. # 创建excel工作表
  3. workbook = xlwt.Workbook(encoding='utf-8')
  4. worksheet = workbook.add_sheet('Weibo')
  5. # 设置表头
  6. worksheet.write(0, 0, label='id')
  7. worksheet.write(0, 1, label='created_at')
  8. worksheet.write(0, 2, label='text')
  9. worksheet.write(0, 3, label='attitudes')
  10. worksheet.write(0, 4, label='comments')
  11. worksheet.write(0, 5, label='reposts')
  12. # 读取json文件
  13. data = []
  14. with open('./weibo.txt', 'r', encoding='UTF-8') as f:
  15. for i in f.readlines():
  16. data.append(json.loads(i))
  17. # 将json字典写入excel
  18. # 变量用来循环时控制写入单元格,感觉有更好的表达方式
  19. val = 1
  20. for list_item in data:
  21. for key, value in list_item.items():
  22. if key == "id":
  23. worksheet.write(val, 0, value)
  24. elif key == "created_at":
  25. worksheet.write(val, 1, value)
  26. elif key == "text":
  27. worksheet.write(val, 2, value)
  28. elif key == "attitudes":
  29. worksheet.write(val, 3, value)
  30. elif key == "comments":
  31. worksheet.write(val, 4, value)
  32. elif key == "reposts":
  33. worksheet.write(val, 5, value)
  34. val += 1
  35. # 保存
  36. workbook.save('./weibo.xls')

        运行结果:

       

3)代码整合完整实现如下:

  1. # -*- coding: utf-8 -*-
  2. import xlwt
  3. import requests
  4. from urllib.parse import urlencode
  5. from pyquery import PyQuery as pq
  6. from openpyxl import Workbook,load_workbook
  7. import json
  8. host = 'm.weibo.cn'
  9. base_url = 'https://%s/api/container/getIndex?' % host
  10. user_agent = 'User-Agent: Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'
  11. headers = {
  12. 'Host': host,
  13. 'Referer': 'https://m.weibo.cn/u/XXX',
  14. 'User-Agent': user_agent
  15. }
  16. # 按页数抓取数据
  17. def get_single_page(page):
  18. params = {
  19. 'type': 'uid',
  20. 'value': XXX,
  21. 'containerid': YYY,
  22. 'page': page
  23. }
  24. url = base_url + urlencode(params)
  25. try:
  26. response = requests.get(url, headers=headers)
  27. if response.status_code == 200:
  28. return response.json()
  29. except requests.ConnectionError as e:
  30. print('抓取错误', e.args)
  31. # 解析页面返回的json数据
  32. def parse_page(json):
  33. items = json.get('data').get('cards')
  34. for item in items:
  35. item = item.get('mblog')
  36. if item:
  37. data = {
  38. 'id': item.get('id'),
  39. 'created_at':item.get('created_at'),
  40. 'text': pq(item.get("text")).text(), # 仅提取内容中的文本
  41. 'attitudes': item.get('attitudes_count'),
  42. 'comments': item.get('comments_count'),
  43. 'reposts': item.get('reposts_count')
  44. }
  45. yield data
  46. def getTXT():
  47. """写入txt文件中"""
  48. fw = open('./weibo.txt','w',encoding='UTF-8')
  49. for page in range(1, 10): # 抓取前十页的数据
  50. json_r = get_single_page(page)
  51. results = parse_page(json_r)
  52. for result in results:
  53. print(result)
  54. fw.writelines(json.dumps((result),ensure_ascii=False))
  55. fw.writelines('\n')
  56. fw.close()
  57. def getExcel():
  58. # 创建excel工作表
  59. workbook = xlwt.Workbook(encoding='utf-8')
  60. worksheet = workbook.add_sheet('罗小黑')
  61. # 设置表头
  62. worksheet.write(0, 0, label='id')
  63. worksheet.write(0, 1, label='created_at')
  64. worksheet.write(0, 2, label='text')
  65. worksheet.write(0, 3, label='attitudes')
  66. worksheet.write(0, 4, label='comments')
  67. worksheet.write(0, 5, label='reposts')
  68. # 读取json文件
  69. data = []
  70. with open('./weibo.txt', 'r', encoding='UTF-8') as f:
  71. for i in f.readlines():
  72. data.append(json.loads(i))
  73. # 将json字典写入excel
  74. # 变量用来循环时控制写入单元格,感觉有更好的表达方式
  75. val = 1
  76. for list_item in data:
  77. for key, value in list_item.items():
  78. if key == "id":
  79. worksheet.write(val, 0, value)
  80. elif key == "created_at":
  81. worksheet.write(val, 1, value)
  82. elif key == "text":
  83. worksheet.write(val, 2, value)
  84. elif key == "attitudes":
  85. worksheet.write(val, 3, value)
  86. elif key == "comments":
  87. worksheet.write(val, 4, value)
  88. elif key == "reposts":
  89. worksheet.write(val, 5, value)
  90. val += 1
  91. # 保存
  92. workbook.save('./weibo.xls')
  93. if __name__ == '__main__':
  94. getTXT()
  95. getExcel()