beautifulSoup,threading.Thread 多線程採集

from urllib import request

from bs4 import BeautifulSoupimport pymysqlimport threadingconfig ={

"target_lst_first":"http://www.ygdy8.net/html/gndy/dyzz/list_23_",

"prex":".html", "page":1, "target_host":"http://www.ygdy8.net"}headers = {User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36}class Movie(threading.Thread): def __init__(self,disc_): self.disc_ = disc_ self.cursor = None

self.mysql()

self.switch_a = True # def go_on(self): pass def mysql(self): if self.cursor == None: connect = pymysql.Connect( host=********,

port=3306,

user=root, passwd=********, db=movies, charset=utf8 ) self.cursor = connect.cursor() return self.cursor def begin(self):

for i in range(1,201):

url = self.disc_[target_lst_first] + str(i) + self.disc_[prex] t = threading.Thread(target=self.lst_rule,args=(url,i,)) t.start() def lst_rule(self,url,page): req = request.Request(url,headers=headers) http = request.urlopen(req) html = http.read().decode("gbk",ignore)

soup = BeautifulSoup(html,"html.parser")

a_lst = soup.select(.ulink) if len(a_lst)>0: release_times = soup.select(font[color="#8F8C89"]) time_list = [] for i in release_times: time_list.append(i.string[3:22]) time_num = 0 for i in a_lst: print("第%s頁執行次數:%s" % (page,time_num))

href = i.attrs[href]

try: if time_num < len(time_list): time = time_list[time_num] time_num += 1 except: print(time_num) print(time_list) pass detail_url = self.disc_[target_host]+href

result = self.detail_rule(detail_url)

result[time] = time # 進入mysql入庫 with open("./result.txt","a+",encoding=utf-8) as f: f.write(str(result)) f.close() else: print(該頁面沒有a標籤的列表) self.switch_a = False pass

return html

def detail_rule(self,url): req = request.Request(url, headers=headers) httpresponse = request.urlopen(req) html = httpresponse.read().decode("gbk",ignore) # with open("d.txt",rb) as f: # html = f.read() # f.close() soup = BeautifulSoup(html, "html.parser")

titles = soup.select("h1 font")

dicts = { "title": , "link": , "con": , } try: if len(titles) > 0: title = titles[0].string else: title = ftp_as = soup.select("table tr td a") if len(ftp_as) > 0: link = ftp_as[0].attrs[href] else: link = uls = soup.select(".co_content8 ul") if len(uls) > 0: con = uls[0] else: con = dicts = { "title":title, "link": link, "con": con, } except Exception as e: with open(./error1.txt,wb) as f: print(e) f.close() dicts = { "title": , "link": , "con": , } pass return dicts def get_html(self): pass def parse_html(self): pass def mysql_insert(self): pass def mysql_find(self): passmove = Movie(config)move.begin()
推薦閱讀：