Python crawler study notes...

This post crawls the m.weibo.cn site: the data is extracted by analysing its API and stored in MongoDB.

The spider file

# -*- coding: utf-8 -*-
import json

import scrapy
from scrapy import Request
from pyquery import PyQuery as pq

from ..items import *


class WeiboSpiderSpider(scrapy.Spider):
    name = 'weibo_spider'
    allowed_domains = ['m.weibo.cn']
    # start_urls = ['http://m.weibo.cn/']

    # user profile
    user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    # weibos
    weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
    # follows
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    # fans; note that the fan endpoint pages with since_id=, not page= as the follow endpoint does
    fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={page}'

    start_uids = [
        2803301701,  # 人民日報
        1699432410,  # 新華社
        1974576991,  # 環球時報
        5476386628,  # 俠客島
    ]

    def start_requests(self):
        for uid in self.start_uids:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

We first modify the Spider: configure the URL of each Ajax endpoint, pick a few high-profile ("big V") accounts and put their IDs into a list, then override start_requests() so that each of these users' profile pages is requested in turn and parsed by parse_user().
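As a quick illustration (a standalone snippet, not part of the spider), this is what the user_url template expands to for the first UID in the list; the expanded URL is the Ajax endpoint whose JSON response parse_user() receives:

# URL template copied from the spider above
user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
print(user_url.format(uid=2803301701))
# https://m.weibo.cn/api/container/getIndex?uid=2803301701&type=uid&value=2803301701&containerid=1005052803301701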

items.py

from scrapy import Item, Field


class UserItem(Item):
    collection = 'users'

    id = Field()                  # user id
    name = Field()                # nickname
    profile_image = Field()       # avatar image
    cover_image = Field()         # background image
    verified_reason = Field()     # verification reason
    description = Field()         # bio
    fans_count = Field()          # number of fans
    follows_count = Field()       # number of follows
    weibos_count = Field()        # number of weibos
    mbrank = Field()              # membership rank
    verified = Field()            # verified or not
    verified_type = Field()       # verification type
    verified_type_ext = Field()   # the fields below: purpose unclear
    gender = Field()
    mbtype = Field()
    urank = Field()
    crawled_at = Field()          # crawl timestamp, set in pipelines.py


class UserRelationItem(Item):
    collection = 'UserRelation'

    id = Field()
    follows = Field()
    fans = Field()


class WeiboItem(Item):
    collection = 'weibos'

    id = Field()
    idstr = Field()
    edit_count = Field()
    created_at = Field()
    version = Field()
    thumbnail_pic = Field()
    bmiddle_pic = Field()
    original_pic = Field()
    source = Field()
    user = Field()
    text = Field()
    crawled_at = Field()

A collection attribute is defined here to indicate the name of the Collection each item is saved to. A user's follows and fans are kept in a separate UserRelationItem, where id is the user's ID, follows is the list of users they follow, and fans is the list of their followers.
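To make the structure concrete, here is a hypothetical example (all values are placeholders invented for illustration) of the data a populated UserRelationItem ends up holding; parse_follows() fills follows and leaves fans empty, while parse_fans() does the opposite:

example_relation = {
    'id': '2803301701',                                       # the crawled user's ID
    'follows': [
        {'id': '1234567890', 'name': 'some_followed_user'},   # placeholder entry
    ],
    'fans': [],                                               # filled by parse_fans() instead
}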

Extracting the data

Next we extract the data. To parse the user information, implement the parse_user() method:

# Parse user information
def parse_user(self, response):
    self.logger.debug(response)
    result = json.loads(response.text)
    if result.get('data').get('userInfo'):
        user_info = result.get('data').get('userInfo')
        user_item = UserItem()
        user_item['id'] = user_info.get('id')                                # user id
        user_item['name'] = user_info.get('screen_name')                     # nickname
        user_item['profile_image'] = user_info.get('profile_image_url')      # avatar image
        user_item['cover_image'] = user_info.get('profile_image_url')        # background image (note: reuses profile_image_url)
        user_item['verified_reason'] = user_info.get('verified_reason')      # verification reason
        user_item['description'] = user_info.get('description')              # bio
        user_item['weibos_count'] = user_info.get('statuses_count')          # number of weibos
        user_item['fans_count'] = user_info.get('followers_count')           # number of fans
        user_item['follows_count'] = user_info.get('follow_count')           # number of follows
        user_item['mbrank'] = user_info.get('mbrank')                        # membership rank
        user_item['verified'] = user_info.get('verified')                    # verified or not
        user_item['verified_type'] = user_info.get('verified_type')          # verification type
        user_item['verified_type_ext'] = user_info.get('verified_type_ext')  # purpose unclear
        user_item['gender'] = user_info.get('gender')
        user_item['mbtype'] = user_info.get('mbtype')
        user_item['urank'] = user_info.get('urank')

        yield user_item

        uid = user_info.get('id')
        # follows
        yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                      meta={'page': 1, 'uid': uid})
        # fans
        yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                      meta={'page': 1, 'uid': uid})
        # weibos
        yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                      meta={'page': 1, 'uid': uid})

Two things are done here.

Parse the JSON, extract the user information, and yield a UserItem;

Construct the first-page links for the user's follows, fans, and weibos and yield a Request for each. The only parameter needed is the user ID, and the initial page number can simply be set to 1 (the small sketch below shows what the formatted first-page URLs look like).
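For reference, a standalone snippet showing how the first-page follow and fan URLs expand for one of the start UIDs; note that the fan endpoint pages with since_id while the follow endpoint uses page, as mentioned in the spider's comments:

# URL templates copied from the spider above
follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&since_id={page}'

uid = 2803301701  # 人民日報, one of the start UIDs
print(follow_url.format(uid=uid, page=1))
# ...getIndex?containerid=231051_-_followers_-_2803301701&page=1
print(fan_url.format(uid=uid, page=1))
# ...getIndex?containerid=231051_-_fans_-_2803301701&since_id=1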

Next, implement parse_weibos(), which parses all the weibos the user has posted:

# Parse the list of weibos
def parse_weibos(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards'):
        weibos = result.get('data').get('cards')
        for weibo in weibos:
            mblog = weibo.get('mblog')
            # mblog is sometimes missing, so check it first
            if mblog:
                weibo_item = WeiboItem()

                weibo_item['id'] = mblog.get('id')  # weibo id
                weibo_item['idstr'] = mblog.get('idstr')
                weibo_item['edit_count'] = mblog.get('edit_count')
                weibo_item['created_at'] = mblog.get('created_at')
                weibo_item['version'] = mblog.get('version')
                weibo_item['thumbnail_pic'] = mblog.get('thumbnail_pic')
                weibo_item['bmiddle_pic'] = mblog.get('bmiddle_pic')
                weibo_item['original_pic'] = mblog.get('original_pic')
                weibo_item['source'] = mblog.get('source')
                weibo_item['user'] = response.meta.get('uid')  # user id

                # Check whether there is a "read full text" link
                all_text = mblog.get('text')
                if '>全文<' in all_text:
                    # Link to the full-text page of the weibo
                    all_text_url = 'https://m.weibo.cn/statuses/extend?id=' + mblog.get('id')
                    yield Request(all_text_url, callback=self.parse_all_text, meta={'item': weibo_item})

                # Check whether this is a repost
                elif pq(mblog.get('text')).text() == '轉發微博':
                    if '>全文<' in mblog.get('retweeted_status').get('text'):
                        # Link to the full-text page of the reposted weibo
                        all_text_url2 = 'https://m.weibo.cn/statuses/extend?id=' + mblog.get(
                            'retweeted_status').get('id')
                        yield Request(all_text_url2, callback=self.parse_all_text, meta={'item': weibo_item})
                    else:
                        # strip zero-width spaces from the text
                        weibo_item['text'] = pq(mblog.get('retweeted_status').get('text')).text().replace('\u200b', '')
                        yield weibo_item

                else:
                    weibo_item['text'] = pq(mblog.get('text')).text().replace('\u200b', '')
                    yield weibo_item

        # Next page of weibos
        uid = response.meta.get('uid')
        page = response.meta.get('page') + 1
        yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                      meta={'uid': uid, 'page': page})


# When a weibo has a "full text" version, fetch the full text
def parse_all_text(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data'):
        all_text = result.get('data').get('longTextContent')
        weibo_item = response.meta['item']
        weibo_item['text'] = pq(all_text).text().replace('\u200b', '')
        # print(weibo_item['text'])
        yield weibo_item

Two things are done here.

First, parse the JSON, extract the weibo information, and yield a WeiboItem.

There are three cases when parsing the weibo text:

1. The weibo is long and its text contains a "full text" link; if it does, go into parse_all_text() to fetch the full text;

2. The extracted text is just "轉發微博" (a repost), so fetch the content of the reposted weibo; if that content contains a full-text link, go into parse_all_text() to fetch the full text, otherwise take the text directly;

# Check whether this is a repost
elif pq(mblog.get('text')).text() == '轉發微博':
    if '>全文<' in mblog.get('retweeted_status').get('text'):
        # Link to the full-text page of the reposted weibo
        all_text_url2 = 'https://m.weibo.cn/statuses/extend?id=' + mblog.get(
            'retweeted_status').get('id')
        yield Request(all_text_url2, callback=self.parse_all_text, meta={'item': weibo_item})
    else:
        weibo_item['text'] = pq(mblog.get('retweeted_status').get('text')).text().replace('\u200b', '')
        yield weibo_item

If you don't want to fetch the original weibo's content for reposts, just comment out the block above.

3. The text contains no full-text link and is not a repost, so the weibo text is taken directly.

Second, construct the link to the next page of the user's weibos.

Next, parse the user's follow list and fan list; the principle is the same for both:

# Parse the user's follow list
def parse_follows(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get(
            'data').get('cards')[-1].get('card_group'):
        # Parse the users
        follows = result.get('data').get('cards')[-1].get('card_group')
        # for follow in follows:
        #     if follow.get('user'):
        #         uid = follow.get('user').get('id')
        #         yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

        uid = response.meta.get('uid')
        # Follow list
        user_relation_item = UserRelationItem()
        follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')} for follow in
                   follows]
        user_relation_item['id'] = uid
        user_relation_item['follows'] = follows
        user_relation_item['fans'] = []
        yield user_relation_item
        # Next page of follows
        page = response.meta.get('page') + 1
        yield Request(self.follow_url.format(uid=uid, page=page),
                      callback=self.parse_follows, meta={'page': page, 'uid': uid})


# Parse the user's fan list
def parse_fans(self, response):
    result = json.loads(response.text)
    if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and result.get(
            'data').get('cards')[-1].get('card_group'):
        # Parse the users
        fans = result.get('data').get('cards')[-1].get('card_group')
        # for fan in fans:
        #     if fan.get('user'):
        #         uid = fan.get('user').get('id')
        #         yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

        uid = response.meta.get('uid')
        # Fan list
        user_relation_item = UserRelationItem()
        fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')} for fan in
                fans]
        user_relation_item['id'] = uid
        user_relation_item['fans'] = fans
        user_relation_item['follows'] = []
        yield user_relation_item
        # Next page of fans
        page = response.meta.get('page') + 1
        yield Request(self.fan_url.format(uid=uid, page=page),
                      callback=self.parse_fans, meta={'page': page, 'uid': uid})

With the code above, the general idea is: starting from a few high-profile Weibo accounts, crawl each one's user information, posted weibos, and follow and fan lists.

If you want to crawl recursively, also fetching the user information and follow/fan lists of every user in those lists, and so on, simply re-enable the commented-out lines shown below:

# for follow in follows:
#     if follow.get('user'):
#         uid = follow.get('user').get('id')
#         yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

# for fan in fans:
#     if fan.get('user'):
#         uid = fan.get('user').get('id')
#         yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

This way the crawl recurses: as long as a user has a social connection to other users, their information will eventually be picked up by the crawler, so in principle every reachable user can be crawled.

Data cleaning

pipelines.py

import re, time

from Weibo.items import *


class TimePipeline():
    def process_item(self, item, spider):
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
            item['crawled_at'] = now
        return item

The crawled_at field is never assigned in the Spider; it represents the crawl time, so we can uniformly set it to the current time. That is what the TimePipeline class above does.

Some weibo timestamps are not standard; they may show up as "剛剛" (just now), "N分鐘前" (N minutes ago), "N小時前" (N hours ago), "昨天" (yesterday), and so on. They need to be converted into a uniform format:

class WeiboPipeline():
    def parse_time(self, date):
        if re.match('剛剛', date):
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
        if re.match(r'\d+分鐘前', date):
            minute = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        if re.match(r'\d+小時前', date):
            hour = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
        if re.match('昨天.*', date):
            date = re.match('昨天(.*)', date).group(1).strip()
            date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date
        if re.match(r'\d{2}-\d{2}', date):
            date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
        return date

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('created_at'):
                item['created_at'] = item['created_at'].strip()
                item['created_at'] = self.parse_time(item.get('created_at'))
            if item.get('pictures'):
                # note: 'pictures' is not declared as a Field in the WeiboItem above
                item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
        return item

A parse_time() method does the conversion, and process_item() applies it to each WeiboItem.
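A quick, informal way to sanity-check parse_time() outside of Scrapy (this assumes the project is named Weibo, matching the imports above; the input strings mimic the relative timestamps returned by m.weibo.cn, and the exact output depends on when you run it):

from Weibo.pipelines import WeiboPipeline

pipeline = WeiboPipeline()
print(pipeline.parse_time('剛剛'))        # current time, e.g. '2019-03-01 12:00'
print(pipeline.parse_time('5分鐘前'))     # the time five minutes ago
print(pipeline.parse_time('昨天 08:30'))  # yesterday's date + ' 08:30'
print(pipeline.parse_time('02-14'))       # current year + '-02-14 00:00'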

Data storage

pipelines.py

There is one more important step in pipelines.py: after the data has been cleaned, save it to MongoDB.

import pymongo


class MongoPipeline(object):
    def __init__(self, local_mongo_host, local_mongo_port, mongo_db):
        self.local_mongo_host = local_mongo_host
        self.local_mongo_port = local_mongo_port
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            local_mongo_host=crawler.settings.get('LOCAL_MONGO_HOST'),
            local_mongo_port=crawler.settings.get('LOCAL_MONGO_PORT'),
            mongo_db=crawler.settings.get('DB_NAME')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.local_mongo_host, self.local_mongo_port)
        # Database name
        self.db = self.client[self.mongo_db]
        # Add an index to each collection (named after the Item's collection attribute)
        self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[UserRelationItem.collection].create_index([('id', pymongo.ASCENDING)])

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            self.db[item.collection].update({'id': item.get('id')},
                                            {'$set': item},
                                            True)
        if isinstance(item, UserRelationItem):
            self.db[item.collection].update(
                {'id': item.get('id')},
                {'$addToSet':
                    {
                        'follows': {'$each': item['follows']},
                        'fans': {'$each': item['fans']}
                    }
                },
                True)
        return item

A few points to note here:

1. In open_spider(), indexes are added to the Collections; an index on the id field is created for each Item's Collection. Since this is a large-scale crawl and the process involves updating existing data, indexing each Collection greatly improves lookup efficiency.

2. In process_item(), items are stored with the update() method. The first argument is the query condition and the second is the crawled Item. The $set operator is used, so when duplicate data is crawled the existing document is updated without deleting fields that are already there; without $set, the whole document would simply be replaced, which could wipe out existing fields. The third argument is set to True, meaning the data is inserted if it does not exist yet. Together this gives "update if present, insert if absent", which effectively deduplicates the data (a short standalone sketch after point 3 illustrates this).

3. For the follow and fan lists, another operator, $addToSet, is used; it inserts values into a list-type field while deduplicating them. Its value is the field to operate on, and the $each operator iterates over the list to be inserted, so the user's follows or fans are appended to the field one by one; the sketch below also demonstrates this.
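The following standalone pymongo sketch (the database name is a throwaway placeholder, and it uses the same legacy update() call as the pipeline) illustrates both behaviours: $set with upsert=True updates or inserts without wiping other fields, while $addToSet with $each appends list entries and skips duplicates:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
db = client['mweibocn_demo']  # placeholder database for the demo

# $set + upsert=True: insert the document if missing, otherwise update only the given fields
db['users'].update({'id': '1'}, {'$set': {'id': '1', 'name': 'a'}}, True)
db['users'].update({'id': '1'}, {'$set': {'fans_count': 100}}, True)
print(db['users'].find_one({'id': '1'}, {'_id': 0}))
# {'id': '1', 'name': 'a', 'fans_count': 100}

# $addToSet + $each: append each entry to the list field, deduplicating along the way
db['UserRelation'].update({'id': '1'},
                          {'$addToSet': {'follows': {'$each': [{'id': '2'}, {'id': '3'}]}}},
                          True)
db['UserRelation'].update({'id': '1'},
                          {'$addToSet': {'follows': {'$each': [{'id': '3'}, {'id': '4'}]}}},
                          True)
print(db['UserRelation'].find_one({'id': '1'}, {'_id': 0}).get('follows'))
# [{'id': '2'}, {'id': '3'}, {'id': '4'}]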

middlewares.py

Hooking up the cookies pool

import json
import logging

import requests


# Cookies pool
class CookiesMiddleware():
    def __init__(self, cookies_url):
        self.logger = logging.getLogger(__name__)
        self.cookies_url = cookies_url

    def get_random_cookies(self):
        try:
            response = requests.get(self.cookies_url)
            if response.status_code == 200:
                cookies = json.loads(response.text)
                return cookies
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        self.logger.debug('Fetching cookies')
        cookies = self.get_random_cookies()
        if cookies:
            request.cookies = cookies
            self.logger.debug('Using cookies ' + json.dumps(cookies))

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            cookies_url=settings.get('COOKIES_URL')
        )

To put it plainly, the cookies pool I used is here: Python3WebSpider/CookiesPool

When I used it, some captchas were hard for the recognizer, so I disabled the captcha-recognition part and added a 10-second wait after the "log in" click, during which I click through the captcha manually.

In my own test I added 10 accounts. If you add a large number of accounts, you can plug in a captcha-recognition service; similar projects can be found on GitHub. (I changed the port to API_PORT = 5005.)
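Before starting the crawl, it can be worth checking the pool's random-cookie endpoint by hand. The sketch below assumes the pool is running locally on port 5005 (matching COOKIES_URL in settings.py) and that it returns a JSON object of cookie name/value pairs, which is what CookiesMiddleware assigns to request.cookies:

import json

import requests

COOKIES_URL = 'http://localhost:5005/weibo/random'  # same value as in settings.py

response = requests.get(COOKIES_URL)
if response.status_code == 200:
    cookies = json.loads(response.text)
    print(json.dumps(cookies, ensure_ascii=False))  # a dict of cookie name/value pairs
else:
    print('cookies pool not reachable, status:', response.status_code)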

settings.py

BOT_NAME = 'Weibo'

SPIDER_MODULES = ['Weibo.spiders']
NEWSPIDER_MODULE = 'Weibo.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
    'Connection': 'keep-alive',
    'Host': 'm.weibo.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

DOWNLOADER_MIDDLEWARES = {
    'Weibo.middlewares.CookiesMiddleware': 554,
    'Weibo.middlewares.ProxyMiddleware': 555,
}

ITEM_PIPELINES = {
    'Weibo.pipelines.TimePipeline': 300,
    'Weibo.pipelines.WeiboPipeline': 301,
    'Weibo.pipelines.MongoPipeline': 302,
}

# Retry the request when one of these status codes is returned
RETRY_HTTP_CODES = [401, 403, 408, 414, 500, 502, 503, 504]

# MongoDB configuration
LOCAL_MONGO_HOST = '127.0.0.1'
LOCAL_MONGO_PORT = 27017
DB_NAME = 'mweibocn'
# Cookies pool
COOKIES_URL = 'http://localhost:5005/weibo/random'
# IP proxy pool
PROXY_URL = 'http://127.0.0.1:5000/proxy/target/weibo/cn'


A sample of the crawled content

That's a wrap for now. An IP proxy pool will be added later; the pain point is that I can't find an efficient, stable proxy pool, and some of them I don't know how to use.


Added an IP proxy pool

I used this author's GitHub source code: 01ly/FooProxy

Hooking it into Scrapy:

import json
import logging
import random

import requests


# IP proxy
class ProxyMiddleware():
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        try:
            # response = requests.get(self.proxy_url)
            # if response.status_code == 200:
            #     proxy = response.text

            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                proxy_d = random.choice(json.loads(response.text))
                ip = proxy_d.get('ip')
                port = proxy_d.get('port')
                proxy = ip + ':' + port

                return proxy
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # if request.meta.get('retry_times'):
        proxy = self.get_random_proxy()
        if proxy:
            uri = 'https://{proxy}'.format(proxy=proxy)
            self.logger.debug('Using proxy ' + proxy)
            request.meta['proxy'] = uri

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(
            proxy_url=settings.get('PROXY_URL')
        )

There is still a problem: after roughly page 250 of 人民日報's fan list, the API stops returning data.

GitHub source code: Ingram7/Weibo

