import base64
import json
import random
import traceback
from time import sleep
from urllib.parse import urljoin
from urllib.request import urlopen

import cv2
import numpy as np
import requests
from lxml import html
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

from application.db import Session, Recipe


def _encode_default_image(path):
    """Return the base64-encoded JPEG bytes of the placeholder image at *path*."""
    placeholder = cv2.imread(path)
    if placeholder is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError(f"could not read default image: {path}")
    _, encoded = cv2.imencode(".jpeg", placeholder)
    return base64.b64encode(encoded)


def _download_image_b64(src):
    """Download the image at *src*, re-encode it as JPEG and return it base64-encoded."""
    with urlopen(src) as resp:
        raw = np.asarray(bytearray(resp.read()), dtype="uint8")
    decoded = cv2.imdecode(raw, cv2.IMREAD_COLOR)
    if decoded is None:
        # cv2.imdecode returns None when the payload is not a decodable image.
        raise ValueError(f"could not decode image downloaded from {src}")
    _, encoded = cv2.imencode(".jpg", decoded)
    return base64.b64encode(encoded)


def getImages():
    """Replace placeholder recipe images with images scraped from each recipe page.

    Every ``Recipe`` row whose ``img`` equals the base64-encoded default image
    is processed: its ``url`` is opened in headless Chrome, the ``src`` of the
    carousel image is read, the image is downloaded, re-encoded as JPEG,
    base64-encoded and committed back to the row.  After each updated row the
    ratio of updated rows to total rows is printed and the loop pauses for
    five seconds.

    Failures for a single recipe are printed and its URL is collected; the
    list of failed URLs is printed once all recipes have been visited.

    Raises:
        FileNotFoundError: if the default placeholder image cannot be read.
    """
    chrome_path = 'C:/tools/chromedriver.exe'
    default_image_path = "./data/images.jpeg"
    image_xpath = '//*[@id="recipe-image-carousel"]/div/div[1]/div[10]/div/a/amp-img'
    # Alternative XPath left in the original source (no div[10] index):
    #   //*[@id="recipe-image-carousel"]/div/div[1]/div/div/a/amp-img

    # Encode the placeholder before starting the browser so a missing file
    # fails fast without leaving a Chrome process behind.
    default_img = _encode_default_image(default_image_path)

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument("--log-level=3")
    driver = webdriver.Chrome(chrome_path, chrome_options=chrome_options)

    error_urls = []
    counter = 0
    db_session = None
    try:
        db_session = Session()
        total = db_session.query(Recipe).count()
        for recipe in db_session.query(Recipe).all():
            try:
                if default_img != recipe.img:
                    # Recipe already has a real image; nothing to do.
                    continue

                driver.get(recipe.url)
                element = WebDriverWait(driver, 30).until(
                    ec.presence_of_element_located((By.XPATH, image_xpath)))
                # Use the element the wait returned instead of looking it up again.
                src = element.get_attribute("src")
                print(src)

                recipe.img = _download_image_b64(src)
                db_session.commit()

                counter += 1
                print(counter/total)
                sleep(5)
            except Exception as e:
                # Best-effort per recipe: reset the session so one failed
                # commit does not break every following recipe.
                # NOTE(review): assumes Session is an SQLAlchemy-style
                # session exposing rollback()/close() - confirm.
                db_session.rollback()
                error_urls.append(recipe.url)
                print(recipe.url)
                print(e)
    finally:
        # Always release the DB session and the browser process.
        if db_session is not None:
            db_session.close()
        driver.quit()

    print(error_urls)


# The script runs the scrape as soon as the module is executed.
getImages()