Thursday, April 14, 2022

[FIXED] How to add a timer to calculate my code execution

April 14, 2022 python, selenium, timer No comments

Issue

I have written a Python script which scrapes products from AliExpress.

Here is my code :

from selenium.webdriver.edge.options import Options  
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver  
from pymongo import MongoClient
from time import sleep
from lxml import html 
import pandas as pd
import cssselect
import pymongo
import json 
import csv 


# Scrape one AliExpress search-result page with a headless Edge browser,
# parse the product cards with lxml, and insert the rows into MongoDB.
options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
# The trailing "{}" is filled with the page number via url.format(page_nb).
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
# Prefixed to each product's href when building the product link.
baseurl = 'https://www.aliexpress.com'

# range(1, 2) yields only page 1.
for page_nb in range(1, 2):
    print('---', page_nb, '---')
    
    driver.get(url.format(page_nb))
    sleep(2)
    # Scroll down one viewport height at a time until the vertical offset
    # stops increasing.
    current_offset = 0
    while True:
        driver.execute_script("window.scrollBy(0, window.innerHeight);")
        sleep(.5)  # JavaScript has time to add elements
        new_offset = driver.execute_script("return window.pageYOffset;")
        print(new_offset,current_offset)
        if new_offset <= current_offset:
            break
        current_offset = new_offset
    
    sleep(3)
    
    # Parse the rendered page source with lxml.
    tree = html.fromstring(driver.page_source)
    
    # NOTE(review): this list is re-created on every page iteration, so with
    # more than one page only the last page's rows would reach the DataFrame.
    results = []
    
    for product in tree.xpath('//div[@class="JIIxO"]//a'):
        title = product.xpath('.//h1/text()')
        
        # Anchors without an <h1> title are skipped.
        if title:
            title = title[0]
            
            price = product.cssselect('div.mGXnE._37W_B span')
            price = [x.text for x in price]

            # The first span's text is used as the currency; the remaining
            # spans are concatenated into the price string.
            # NOTE(review): price[0] raises IndexError when the selector
            # matches no spans.
            currency = price[0]
            price = ''.join(price[1:])
            # Each optional field below falls back to the string 'None'
            # when its XPath matches nothing.
            stars = product.xpath('.//span[@class="eXPaM"]/text()')
            if stars :
                stars  = stars [0]
            else:
                stars  = 'None'
                
            nb_sold = product.xpath('.//span[@class="_1kNf9"]/text()')
            if nb_sold:
                nb_sold = nb_sold[0]
            else:
                nb_sold = 'None'
            supl = product.xpath('.//a[@class="ox0KZ"]/text()')
            if supl:
                supl = supl[0]
            else:
                supl = 'None'

            ship_cost = product.xpath('.//span[@class="_2jcMA"]/text()')
            if ship_cost:
                ship_cost = ship_cost[0]
            else:
                ship_cost = 'None'
            
            # When no href is found, product_links stays an empty list.
            product_links = product.xpath('./@href')
            if product_links:
                product_links = str(baseurl) + str( product_links[0])
            
            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links]
            results.append(row)
            print('len(results):', len(results))

    # NOTE(review): close() is inside the page loop; with more than one page
    # the next driver.get() would presumably run against a closed window.
    driver.close()
df = pd.DataFrame(results , columns=("Title","Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks" ))

####### Insert in database #############
# Every run inserts into the same db2.aliex2 collection.
client = MongoClient("mongodb://localhost:27017/")     
collection = client['db2']['aliex2']     
data = df.to_dict(orient = 'records')     
collection.insert_many(data)

My question :

What I need is to add a timer that calculates how long the process takes and returns that value, so I know how much time the code needs to run. I also want a way to get a new collection after each scraping run, because when I run my code a second time, the new data is stored alongside the data in the old collection.

I appreciate any help from you . Thank you !

Solution

The code below should solve your problem:

from selenium.webdriver.edge.options import Options  
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver  
from pymongo import MongoClient
from time import sleep
from lxml import html 
import pandas as pd
import cssselect
import pymongo
import json 
import csv 
import time as Time


# Scrape AliExpress search results with a headless Edge browser, time the
# run, and store the rows in a fresh MongoDB collection.
#
# Two timings are produced:
#   * "Time Taken" column - seconds spent parsing each individual product.
#   * total_time          - seconds for the whole script (browser start-up,
#                           page loading, scrolling, parsing, DB insert).

# perf_counter() is a monotonic, high-resolution clock, so short per-product
# timings do not collapse to 0.0 and clock adjustments cannot skew the total.
run_start = Time.perf_counter()


def _first_or_none(values):
    """Return the first item of *values*, or the string 'None' if empty."""
    return values[0] if values else 'None'


options = Options()
options.headless = True
driver = webdriver.Edge(executable_path=r"C:\Users\aicha\Desktop\mycode\aliexpress_scrap\scrap\codes\msedgedriver",options=options)
# The trailing "{}" is filled with the page number via url.format(page_nb).
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth+earphones&ltype=wholesale&SortType=default&page={}'
baseurl = 'https://www.aliexpress.com'

# Created once, outside the page loop, so rows from every page are kept.
results = []

try:
    for page_nb in range(1, 2):
        print('---', page_nb, '---')

        driver.get(url.format(page_nb))
        sleep(2)

        # Scroll one viewport at a time until the offset stops growing, so
        # lazily loaded products are present in the page source.
        current_offset = 0
        while True:
            driver.execute_script("window.scrollBy(0, window.innerHeight);")
            sleep(.5)  # JavaScript has time to add elements
            new_offset = driver.execute_script("return window.pageYOffset;")
            print(new_offset, current_offset)
            if new_offset <= current_offset:
                break
            current_offset = new_offset

        sleep(3)

        tree = html.fromstring(driver.page_source)

        for product in tree.xpath('//div[@class="JIIxO"]//a'):
            start_time = Time.perf_counter()

            title = product.xpath('.//h1/text()')
            if not title:
                # Anchors without a title are not product cards.
                continue
            title = title[0]

            # First span is the currency, the remaining spans form the price.
            # "or ''" protects join() against spans without text.
            price_parts = [x.text or '' for x in product.cssselect('div.mGXnE._37W_B span')]
            if price_parts:
                currency = price_parts[0]
                price = ''.join(price_parts[1:])
            else:
                # No price spans found: use the same placeholder as the
                # other optional fields instead of raising IndexError.
                currency = 'None'
                price = 'None'

            stars = _first_or_none(product.xpath('.//span[@class="eXPaM"]/text()'))
            nb_sold = _first_or_none(product.xpath('.//span[@class="_1kNf9"]/text()'))
            supl = _first_or_none(product.xpath('.//a[@class="ox0KZ"]/text()'))
            ship_cost = _first_or_none(product.xpath('.//span[@class="_2jcMA"]/text()'))

            href = product.xpath('./@href')
            product_links = str(baseurl) + str(href[0]) if href else 'None'

            # Seconds spent parsing this one product.
            difference_time = Time.perf_counter() - start_time

            row = [title, price, currency, stars, nb_sold, ship_cost, supl, product_links, difference_time]
            results.append(row)
            print('len(results):', len(results))
finally:
    # Runs once after all pages, and also when an exception occurs.
    # quit() ends the whole WebDriver session, not just the current window.
    driver.quit()

df = pd.DataFrame(results, columns=("Title", "Price", "Currency", "Stars", "Orders", "Shipcost", "Supplier", "Productlinks", "Time Taken"))

####### Insert in database #############
client = MongoClient("mongodb://localhost:27017/")
# A timestamped name gives every run its own collection, so new rows are
# never mixed with rows from earlier runs. To keep only the latest run under
# the fixed name instead, use client['db2']['aliex2'] and call
# collection.drop() before inserting.
collection_name = 'aliex2_' + Time.strftime('%Y%m%d_%H%M%S')
collection = client['db2'][collection_name]
data = df.to_dict(orient='records')
if data:
    # insert_many() raises on an empty list, so skip it when nothing was scraped.
    collection.insert_many(data)
client.close()

# Total execution time of the script, in seconds.
total_time = Time.perf_counter() - run_start
print('Total execution time (seconds):', round(total_time, 2))

I measured the time taken inside the loop and stored that time difference in the DataFrame.

Answered By - Devam Sanghvi

This answer was collected from Stack Overflow, tested by PythonFixing community admins, and is licensed under CC BY-SA 2.5, CC BY-SA 3.0 and CC BY-SA 4.0.

Thursday, April 14, 2022

[FIXED] How to add a timer to calculate my code execution

Issue

Solution

0 comments:

Post a Comment

Popular Posts

Labels