アマゾンコメント

GitHubで公開されているアマゾンのコメントのスパイダーを使いました。商品のコメントの統計が出てきます。
  

"""Scrape Amazon product reviews for every product link listed in read_link.txt.

For each link the product id (ASIN) is extracted from the ``/dp/<id>`` part
of the URL, the review pages are fetched with Selenium, parsed with
BeautifulSoup, and written to ``<id>.csv`` with columns
``user, content, times, scores``.
"""

import csv
import re
import time

from bs4 import BeautifulSoup
from selenium import webdriver


def _extract_asin(link):
    """Return the product id embedded in an Amazon product link, or "".

    The original pattern ``dp/(.*?)`` could never work: a trailing
    non-greedy group with nothing after it always matches zero characters,
    so ``findall(...)[0]`` was always the empty string.  Anchor the match
    on the canonical 10-character ASIN first, then fall back to "everything
    between ``dp/`` and the next ``/`` or ``?``".
    """
    match = re.search(r"/dp/([A-Z0-9]{10})", link)
    if match is None:
        match = re.search(r"dp/([^/?]+)", link)
    return match.group(1) if match else ""


def _parse_reviews(page_source):
    """Parse one review page and return rows of [user, content, time, score].

    NOTE(review): the original extraction regexes were garbled in transit
    (HTML brackets stripped), so the exact selectors could not be recovered;
    these ``data-hook`` attributes are the standard Amazon review markup —
    confirm against a live page.
    """
    soup = BeautifulSoup(page_source, "lxml")
    rows = []
    for review in soup.find_all(attrs={"data-hook": "review"}):
        name = review.find(class_="a-profile-name")
        score = review.find(attrs={"data-hook": "review-star-rating"})
        date = review.find(attrs={"data-hook": "review-date"})
        body = review.find(attrs={"data-hook": "review-body"})
        rows.append([
            name.get_text(strip=True) if name else "",
            body.get_text(strip=True) if body else "",
            date.get_text(strip=True) if date else "",
            score.get_text(strip=True) if score else "",
        ])
    return rows


def _scrape_product(driver, link, asin):
    """Fetch every review page for one product and write them to <asin>.csv.

    Stops early on the Amazon error page ("Tut uns Leid!") or when a page
    yields no reviews.  The CSV file is managed with ``with`` so the handle
    is always closed (the original leaked it via ``csv.writer(open(...))``).
    """
    base_url = link.split(r"/dp/")[0]
    with open(f"{asin}.csv", "w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out)
        writer.writerow(["user", "content", "times", "scores"])
        # Upper bound of 149 pages, as in the original script.
        for page in range(1, 150):
            url = (
                f"{base_url}/product-reviews/{asin}/"
                f"ref=cm_cr_getr_d_paging_btm_prev_{page}"
                f"?ie=UTF8&reviewerType=all_reviews&pageNumber={page}"
            )
            print(url)
            driver.get(url)
            content = driver.page_source
            # Amazon.de "Sorry!" error page — nothing more to fetch.
            if "Tut uns Leid!" in content:
                break
            rows = _parse_reviews(content)
            if not rows:
                break
            for user, body, when, score in rows:
                print(user, score, when, body)
            writer.writerows(rows)
            # Be polite between page fetches.
            time.sleep(1)


def main():
    """Read product links, then scrape and save reviews for each one."""
    with open("read_link.txt", "r", encoding="utf-8") as fh:
        # Skip blank / junk lines exactly as the original (len < 5) did.
        links = [line for line in fh.read().split("\n") if len(line) >= 5]

    driver = webdriver.Chrome(executable_path="chromedriver.exe")
    try:
        for link in links:
            print(link)
            asin = _extract_asin(link)
            if not asin:
                # Original silently skipped unparseable links; keep that
                # best-effort behaviour but say why.
                print("リンクからidを抽出できません: {}".format(link))
                continue
            print("当前爬取的id 是: {}".format(asin))
            _scrape_product(driver, link, asin)
    finally:
        # Always release the browser, even if a product fails mid-scrape.
        driver.quit()


if __name__ == "__main__":
    main()


しかし、リンクが必要である。VSCodeでは実行できなくて、他のアプリでやりました。
ソース:github
利用するビデオ:youtube