# 相关库
import pymysql import pymysql.cursors from bs4 import BeautifulSoup import
requestsimport random import time from selenium import webdriver from selenium.
webdriver.common.by import By from selenium.webdriver.support.ui import
WebDriverWaitfrom selenium.webdriver.support import expected_conditions as EC
import codecs from selenium.common.exceptions import TimeoutException
# 从数据库中读取车型(车型已经存放在数据库中,这里读取车型的id,用于拼接到url上)
# Load the distinct car models whose prices will be scraped.
# With DictCursor every row is a dict keyed by the selected columns
# ("car_id" and "car_name"), so `cars` ends up as a list of such dicts.
cars = []
conn = pymysql.connect(
    host='*******',
    charset='utf8',
    user='*******',  # redacted placeholder; the original quote was unbalanced
    passwd='*****',
    db='mysql',
    cursorclass=pymysql.cursors.DictCursor,
)
try:
    # The cursor is used as a context manager so it is closed even when a
    # query fails; the connection itself is closed in `finally` below.
    with conn.cursor() as cur:
        cur.execute("USE data_etl")
        cur.execute("select distinct(car_id),car_name from user_car_port")
        # fetchall() replaces the manual fetchone() loop; extend() keeps
        # `cars` a plain list whatever sequence type the driver returns.
        cars.extend(cur.fetchall())
    count = len(cars)
    print(count)
finally:
    conn.close()
# 由于汽车之家的反爬机制比较复杂,我们直接调用浏览器接口
# Browser session shared by the scraping functions in this file. It is
# created as a module-level side effect.
# NOTE(review): the positional driver-path argument is the Selenium 3 calling
# convention; confirm the installed Selenium version still accepts it.
driver = webdriver.Chrome('chromedriver.exe')


def getCarPriceOffSale(innerHtml):
    """Parse the price range of a discontinued model from an HTML fragment.

    The fragment is searched for the first ``<span class="price">`` and,
    inside it, a ``<strong class="red">``. The text of that element is split
    on ``"-"``, the unit characters ``万`` / ``元`` are removed, and the
    pieces are converted to floats.

    Args:
        innerHtml: HTML markup to search. ``None`` is treated as empty markup.

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper price.
        When the text holds a single number (or does not split into exactly
        two parts) ``top`` equals ``button``. ``(0.0, 0.0)`` is returned when
        no price could be found; a diagnostic is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型已经停售!")
    # An explicit parser keeps results independent of which optional parser
    # libraries happen to be installed.
    bsObj = BeautifulSoup(innerHtml or "", "html.parser")

    # find() returns None when nothing matches, so each "missing" case gets
    # its own message instead of falling into a generic exception handler.
    spanPrice = bsObj.find("span", {"class": "price"})
    if spanPrice is None:
        print("价格span为空")
        return button, top

    strongPrice = spanPrice.find("strong", {"class": "red"})
    if strongPrice is None:
        print("价格strong为空")
        return button, top

    text = strongPrice.text.strip()
    if not text:
        print("价格字段为空")
        return button, top

    prices = [part.replace("万", "").replace("元", "") for part in text.split("-")]
    try:
        # Only the float conversions can fail here (non-numeric price text).
        button = float(prices[0])
        top = float(prices[1]) if len(prices) == 2 else button
    except ValueError:
        print("程序出错!停售车型")
    return button, top
# 处理在售车型的价格信息
def getCarPriceOnSale(innerHtml):
    """Parse the price range of a model that is on sale from an HTML fragment.

    The first ``<dd>`` element of the fragment is searched for an
    ``<a class="emphasis">``. Its text is split on ``"-"``, the unit
    characters ``万`` / ``元`` are removed, and the pieces are converted to
    floats.

    Args:
        innerHtml: HTML markup to search. ``None`` is treated as empty markup.

    Returns:
        A ``(button, top)`` tuple of floats: the lower and upper price.
        When the text does not split into exactly two parts ``top`` equals
        ``button``. ``(0.0, 0.0)`` is returned when no price could be
        parsed; a diagnostic is printed in that case.
    """
    button = 0.0
    top = 0.0
    print("此车型在售")
    # An explicit parser keeps results independent of which optional parser
    # libraries happen to be installed.
    bsObj = BeautifulSoup(innerHtml or "", "html.parser")

    # find() returns None instead of raising when there is no <dd>; the
    # message printed for that case is the one the old IndexError path printed.
    ddprice = bsObj.find("dd")
    if ddprice is None:
        print("程序出错!在售车型")
        return button, top

    link = ddprice.find("a", {"class": "emphasis"})
    if link is None:
        print("此车型暂时无法查询价格")
        return button, top

    prices = [part.replace("万", "").replace("元", "") for part in link.text.split("-")]
    try:
        # Only the float conversions can fail here (non-numeric price text).
        button = float(prices[0])
        top = float(prices[1]) if len(prices) == 2 else button
    except ValueError:
        print("程序出错!在售车型")
    return button, top
# 根据车型id获取价格:先按在售页面解析,等待超时后再按停售页面解析
def getCarPrice(carId):
    """Open the page for one car model and return its price range.

    The page is first treated as an on-sale model: the function waits up to
    5 seconds for an element with class ``information-summary`` and then
    hands the inner HTML of the ``information-price`` element to
    ``getCarPriceOnSale``. If that wait (or the page load) times out, it
    waits up to 5 seconds for a ``car_price`` element and hands its inner
    HTML to ``getCarPriceOffSale``.

    Args:
        carId: Identifier appended (via ``str``) to the module-level ``url``.

    Returns:
        A ``(button, top)`` tuple of floats; ``(0.0, 0.0)`` when neither
        page layout was found or no price could be parsed.
    """
    button = 0.0
    top = 0.0
    try:
        # NOTE(review): `url` is not defined in the visible part of this
        # file; it must exist at module level before this function runs.
        driver.get(url + str(carId))
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, "information-summary"))
        )
        # find_elements(By...) is available in both Selenium 3 and 4 and
        # returns an empty list instead of raising, so a page without a
        # price element no longer aborts the whole crawl.
        priceBlocks = driver.find_elements(By.CLASS_NAME, "information-price")
        if priceBlocks:
            button, top = getCarPriceOnSale(priceBlocks[0].get_attribute('innerHTML'))
        else:
            print("此车型有问题:" + str(carId))
    except TimeoutException:
        try:
            # until() returns the located element, so it is reused directly
            # rather than looked up a second time.
            ele = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "car_price"))
            )
            button, top = getCarPriceOffSale(ele.get_attribute('innerHTML'))
        except TimeoutException:
            print("此车型有问题:" + str(carId))
    return button, top
# 遍历数据库中所有车型的id
# Value stored for both bounds when getCarPrice() reports (0.0, 0.0),
# i.e. when no price could be scraped for the model.
UNKNOWN_PRICE = 9999

# Visit every model loaded into `cars` and attach its price range to the row
# dict under the keys "button" (lower bound) and "top" (upper bound).
try:
    for car in cars:
        car_id = car["car_id"]  # renamed from `id` to stop shadowing the builtin
        # Random 1-5 second pause before each page load (presumably to look
        # less like an automated client).
        time.sleep(random.randint(1, 5))
        button, top = getCarPrice(car_id)
        if button == 0.0 and top == 0.0:
            car["button"] = UNKNOWN_PRICE
            car["top"] = UNKNOWN_PRICE
        else:
            car["button"] = button
            car["top"] = top
finally:
    # Shut the browser down even if the crawl is interrupted; otherwise the
    # Chrome/chromedriver processes keep running after the script ends.
    driver.quit()
热门工具 换一换