Page: https://www.walmart.com/search/?page=2&ps=40&query=pillowcase
When scraping Walmart's product search results, I only ever get the first 10 products from each page, even though each page holds 40 products.
The page doesn't look like a dynamically loaded one either,
but I did notice a pattern when opening it in a browser: it loads 10 products first, and after a second or two the remaining products appear.
Could someone point me in the right direction? This has been giving me a headache for a long time, and I couldn't find anything about it online.
My code:
import urllib.request
import random
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
import time as t
import numpy
import datetime
# A pool of User-Agent strings to pick from at random
my_headers = [
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"
]
# Build the list of search-result URLs to crawl
keyword = 'Networking+Cables'
def geturl():
    url = 'https://www.walmart.com/search/?grid=true&'
    urls = []
    for i in range(1):
        if i == 0:
            urls.append(url + "query=" + keyword)
        else:
            # Combine the page number, page size and query into a new URL
            urls.append(url + "page=" + str(i + 1) + "&ps=40&" + "query=" + keyword)
    return urls
# Get the list of page URLs
urls=geturl()
print(urls)
links=[]
wmid=[]
time=[]
locate=[]
ranking=[]
seller=[]
price=[]
price_min=[]
price_max=[]
reviews=[]
star=[]
prodflag=[]
title=[]
# Pick a random User-Agent
randdom_header = random.choice(my_headers)
a=0
n=1
rank=1
for i in urls:
    print("Parsing page " + str(n) + "......")
    t.sleep(numpy.random.uniform(0, 4))
    # Build the request with urllib and attach the chosen User-Agent
    req = urllib.request.Request(i)
    req.add_header("User-Agent", randdom_header)
    # Fetch the page and decode the response
    response = urllib.request.urlopen(req)
    html = response.read().decode('utf-8')
    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    for li in soup.find('div', id="searchProductResult").find_all('li'):
        a += 1
        try:
            reviews.append(li.find('span', class_="stars-reviews-count").span.string)
        except:
            reviews.append(0)
        link = "https://www.walmart.com" + li.find('a', class_="product-title-link line-clamp line-clamp-2 truncate-title").attrs['href']
        if len(link) > 30:
            link_new = link.split("?")[0]
            links.append(link_new)
            time.append(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
            title.append(link.split('/')[4].replace('-', ' '))
            wmid.append(link_new.split('/')[-1])
            try:
                prodflag.append(li.find('span', class_="flag-angle__content").string)
            except:
                prodflag.append("None")
            locate.append(str(n) + "-" + str(a))
            ranking.append(rank)
            rank += 1
            seller.append(li.find('div', class_="search-result-product-shipping-details gridview").get_text())
            # A price range shows up as two "price price-main" spans; otherwise there is a single price
            try:
                price_spans = li.find('span', class_="price-main-block").find_all('span', class_="price price-main")
                min_ = price_spans[0].span.string
                max_ = price_spans[1].span.string
                price.append(min_ + "-" + max_)
                price_min.append(min_)
                price_max.append(max_)
            except:
                price_ = li.find('span', class_="price display-inline-block arrange-fit price price-main").span.string
                price.append(price_)
                price_min.append(price_)
                price_max.append(price_)
            star.append(li.find('span', class_='visuallyhidden seo-avg-rating').string)
    print("Page " + str(n) + " parsed successfully!")
    n += 1
    a = 0
table=pd.DataFrame()
table['Link']=links
table['Time']=time
table['WMID']=wmid
table['Locate']=locate
table['Ranking']=ranking
table['Seller']=seller
table['Price']=price
table['Price_min']=price_min
table['Price_max']=price_max
table['Reviews']=reviews
table['Star']=star
table['prodFlag']=prodflag
table['Title']=title
table.to_excel('walmart_spider/'+keyword+'_rank_info.xlsx',sheet_name=keyword,index=False)
print(table)
Since the full set of items only loads after a few seconds, why not just wait a few seconds before you start scraping?
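That is probably the right direction, but with plain urllib there is nothing to wait for: the remaining items appear to be filled in by the page's JavaScript, which urllib never runs. The wait has to happen inside a real browser session, for example with the Selenium webdriver that is already imported but unused in the code above. Below is a minimal sketch of that idea; the '#searchProductResult li' selector is carried over from the question's own code, and the scroll step plus the 15-second timeout are assumptions that may need tuning.

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

url = 'https://www.walmart.com/search/?page=2&ps=40&query=pillowcase'
driver = webdriver.Chrome()
driver.get(url)
# Scroll to the bottom so the lazy-loaded items are actually requested
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
# Wait (up to 15 s) until more than 10 <li> items sit inside the result container
WebDriverWait(driver, 15).until(
    lambda d: len(d.find_elements(By.CSS_SELECTOR, '#searchProductResult li')) > 10
)
# Hand the fully rendered HTML to the existing BeautifulSoup parsing code
soup = BeautifulSoup(driver.page_source, 'html.parser')
items = soup.find('div', id="searchProductResult").find_all('li')
print(len(items))  # should now be close to 40 instead of 10
driver.quit()

Once driver.page_source holds the fully rendered page, the rest of the original loop (the per-li parsing and the pandas table) can stay as it is; only the urlopen part needs to be replaced.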