Issue
I am building a scraper that runs continuously against an auction website. The scraper first collects the links to the cars, then goes to each link and checks whether the car has been sold. If the car is sold, the scraper writes its data to a CSV file; if it is not sold, it continues to the next link and finishes the pass.
Once a pass is complete, it starts again from the beginning: it scrapes the car links, appends them to a list, and then scrapes each car from that list. The downside of this procedure is that if the script is stopped for any reason, the data stored in the list is also lost.
So, what would be the best way to store the data so that even if the script breaks for any reason, the data is not lost and can be re-accessed when the script runs again? I tried to store the links in a text file, but when I read the file after writing it, it doesn't show any stored links. A minimal sketch of the behaviour I am after is below.
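This is only an illustrative sketch of the file-based persistence I want, not my actual attempt; the file name links.txt is just a placeholder:

def save_link(link, path='links.txt'):
    # append each link as soon as it is found; closing the file flushes it to disk
    with open(path, 'a', encoding='utf-8') as f:
        f.write(link + '\n')

def load_links(path='links.txt'):
    # reload whatever a previous run managed to save
    try:
        with open(path, encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        return []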
Below is my code.
print('***Please enter the years range***')
year_from = 2000 # you can change this value.
year_to = 2022 # you can change this value.
pause = 8 # will run again after 24 hours.
import requests
from scrapy.selector import Selector
import csv
import re
from time import sleep
import datetime
from random import randint
headers = {
    'authority': 'www.pickles.com.au',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '^\\^Chromium^\\^;v=^\\^92^\\^, ^\\^',
    'sec-ch-ua-mobile': '?0',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'sec-fetch-site': 'none',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-user': '?1',
    'sec-fetch-dest': 'document',
    'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    'if-modified-since': 'Sun, 29 Aug 2021 20:36:16 GMT',
}
while True:
    pickles_links_list = []
    live_auctions_api = 'https://www.pickles.com.au/PWR-Web/services/api/sales/future'
    api_request = requests.get(url=live_auctions_api, headers=headers)
    for auctions in api_request.json():
        auction_link = auctions.get('viewSaleListingLink')
        if 'cars/item/search/-/listing/listSaleItems/' in auction_link:
            auction_request = requests.get(url=auction_link, headers=headers)
            response = Selector(text=auction_request.text)
            sales_id_re = response.xpath('//script[contains(text(), "Product_Type_Sequence")]/text() | //script[contains(text(), "lot_number_suffix_sequence")]/text()').get()
            sales_id = re.findall(r'"Product_Type_Sequence";var n="(.*?)"', sales_id_re) or re.findall(r'"lot_number_suffix_sequence";var n="(.*?)"', sales_id_re)
            if sales_id == []:
                continue
            auction_sale_link = f'https://www.pickles.com.au/v4/caradvert/saleid-{sales_id[0]}-public?count=true&inav=Car%7Cbc%7Cha%7Cu&q=(And.ProductType.Vehicles._.Year.range({year_from}..{year_to}).)&sr=%7Clot_number_suffix_sequence%7C0%7C30'
            auction_sale_link_requests = requests.get(url=auction_sale_link, headers=headers)
            auctions_data = auction_sale_link_requests.json().get('SearchResults')
            if auctions_data == []:
                print({"No results for": auction_sale_link_requests.url})
            for auction_data in auctions_data:
                ids = auction_data.get('TargetId')
                main_title = auction_data.get('Title')
                link_path = main_title.replace(' ', '-').replace('/', '-').replace(',', '-') + '/' + str(ids)
                each_auction_link = f'https://www.pickles.com.au/cars/item/-/details/{link_path}'
                pickles_links_list.append(each_auction_link)
                print({'Link': each_auction_link})
    # going through each link in the text file and checking the results
    with open('pickles.csv', 'a+', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_header = [
            'Title', 'Make', 'Model', 'Variant',
            'Transmission', 'Odometer', 'State',
            'Sale Price', 'Link', 'Sold Date & Time',
            'Sold To', 'Condition Report', 'Description',
        ]
        # csv_writer.writerow(csv_header)
        unique_links_list = list(set(pickles_links_list))
        print('''
        ###################################
        #                                 #
        #                                 #
        #     Now scraping sold items     #
        #                                 #
        #                                 #
        ###################################
        ''')
        sleep(1)
        print({'Total links': f'*** {len(unique_links_list)} ***'})
        sleep(3)
        for each_link in unique_links_list:
            print({'Scraping': each_link})
            random_delay = randint(1, 7)
            print(f'*** Sleeping for [{random_delay}] seconds ***')
            sleep(random_delay)
            each_auction_request = requests.get(each_link, headers=headers)
            response = Selector(text=each_auction_request.text)
            current_status = response.xpath('//h6[@class="mt-2"]/text()[2]').get()
            sold_auctions_list = []
            if current_status == 'This item has been sold. ' and each_link not in sold_auctions_list:
                ids = each_link.split('/')[-1]
                title = response.xpath('//div[@class="row"]//h1/text()').get()
                description = response.xpath('//td[@itemprop="description"]/text()').get()
                condition_report = response.xpath('//a[contains(text(), "Condition Report")]/@href').get()
                make = description.split(', ')[1]
                model = description.split(', ')[2]
                variant = description.split(', ')[3]
                transmission = response.xpath('//i[contains(@class, "transmission")]/following-sibling::span/text()').get()
                odometer = response.xpath('//i[contains(@class, "mileage")]/following-sibling::span/text()').get()
                state = response.xpath('//td[contains(text(), "Location")]/following-sibling::td/text()').get().split(', ')[-1]
                # bid history api
                bid_history = f'https://www.pickles.com.au/PWR-Web/services/api/bidHistoryService/bidHistory?item={ids}'
                sold_item_request = requests.get(url=bid_history, headers=headers)
                sold_item_resp = sold_item_request.json()[0]
                winning_price = sold_item_resp.get('actualBid')
                sold_time_in_ms = sold_item_resp.get('bidTimeInMilliSeconds')
                sold_date_time = datetime.datetime.fromtimestamp(sold_time_in_ms / 1000.0, tz=datetime.timezone.utc).isoformat()
                sold_to = sold_item_resp.get('bidderAnonName')
                auction_values = [
                    title, make, model, variant, transmission, odometer,
                    state, "${:,.2f}".format(winning_price).strip(),
                    each_auction_request.url, sold_date_time, sold_to,
                    f'https://www.pickles.com.au{condition_report}', description,
                ]
                csv_writer.writerow(auction_values)
                print('*** Sold item found and added to the CSV file ***')
                sold_auctions_list.append(each_link)
            else:
                print('*** This item is not sold yet ***')
                continue
Solution
You can use DataFrames to keep track of the extracted links and wrap the scraping loop in a try/except so the DataFrame is saved to disk if the script breaks. Here is the sample code.
import pandas as pd
import os

class Scraping_data():
    def __init__(self):
        self.data_directory = 'your_data_directory'

    def load_links(self):
        # All links to scrape, one per row in a 'links' column.
        df_links = pd.read_csv('./links_file.csv')
        if 'extracted_links.csv' in os.listdir(self.data_directory):
            # Resume: drop links that were already extracted in a previous run.
            df_extracted = pd.read_csv(os.path.join(self.data_directory, 'extracted_links.csv'))
            df_links = df_links[~df_links['links'].isin(df_extracted['links'])]
            df_links.reset_index(drop=True, inplace=True)
        else:
            df_extracted = pd.DataFrame(columns=['links', 'status'])
        return df_extracted, df_links

    def scrap_data(self):
        df_extracted, df_links = self.load_links()
        extracted_users = []
        try:
            for index, row in df_links.iterrows():
                # Your scraping logic here.
                # row['links'] will give you the current link.
                # Upon successful extraction of a link:
                data_row = {'links': row['links'], 'status': 'extracted'}
                extracted_users.append(data_row)
            df_extracted = pd.concat([df_extracted, pd.DataFrame(data=extracted_users)], ignore_index=True)
            df_extracted.to_csv(os.path.join(self.data_directory, 'extracted_links.csv'), index=False)
        except:
            # If the script breaks mid-run, save whatever has been extracted so far.
            df_extracted = pd.concat([df_extracted, pd.DataFrame(data=extracted_users)], ignore_index=True)
            df_extracted.to_csv(os.path.join(self.data_directory, 'extracted_links.csv'), index=False)
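A minimal usage sketch, assuming the link-collection part of your scraper has produced links_file.csv with a links column and that your_data_directory already exists (both names are placeholders from the code above):

import pandas as pd

# Links collected by the first part of the scraper (placeholder values here).
pickles_links_list = ['https://www.pickles.com.au/cars/item/-/details/example-car/12345']

# Persist the de-duplicated links so a later (or restarted) run can resume from them.
pd.DataFrame({'links': list(set(pickles_links_list))}).to_csv('./links_file.csv', index=False)

scraper = Scraping_data()
scraper.scrap_data()  # on restart, load_links() skips links already listed in extracted_links.csv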
Answered By - Muhammad Hassan