Issue
I'm new to data scraping, and recently I was trying to scrape data from wunderground.com using the Selenium library with Python. However, I found that the Selenium web driver sometimes fails to open the webpage successfully. I think this issue may be related to the JavaScript the website uses, but I'm not sure which part goes wrong. Does anyone know how to solve it? Thanks in advance.
Here is an example of the page displaying correctly: example for correctly showing
Here is an example of the problematic case: example for problematic one
Here is my code, which is just a few very simple Selenium calls:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver import ChromeOptions
from selenium.webdriver import ActionChains
import time
# Page to scrape.
url = "https://www.wunderground.com/history/daily/us/ca/san-diego/KSAN/date/2021-2-1"

# Chrome configuration, listed in the order it is applied to the options object.
experimental_options = (
    ('excludeSwitches', ['enable-automation']),
    ('useAutomationExtension', False),
    # Content-setting value 1 enables JavaScript; 2 would disable it.
    ("prefs", {'profile.managed_default_content_settings.javascript': 1}),
)
chrome_arguments = (
    '--disable-gpu',
    "--disable-blink-features",
    "--disable-blink-features=AutomationControlled",
    "--enable-javascript",
)

# Build the options object from the two tables above.
option = webdriver.ChromeOptions()
for option_name, option_value in experimental_options:
    option.add_experimental_option(option_name, option_value)
for chrome_argument in chrome_arguments:
    option.add_argument(chrome_argument)

# Start Chrome with these options and open the page.
driver = webdriver.Chrome(options=option)
driver.get(url)

# Fixed pause to give the page time to load.
time.sleep(5)
Solution
The page sends an HTTP GET request to: https://api.weather.com/v1/location/KSAN:9:US/observations/historical.json?apiKey=e1f10a1e78da46f5b10a1e78da96f525&units=e&startDate=20210201
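The response to this call is a large JSON document that contains the data you are looking for, so you can request this URL directly instead of rendering the page with Selenium. (A subset of the response is shown below.)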
{
"metadata": {
"language": "en-US",
"transaction_id": "1631220781880:2112944028",
"version": "1",
"location_id": "KSAN:9:US",
"units": "e",
"expire_time_gmt": 1631224381,
"status_code": 200
},
"observations": [
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612176660,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612169460,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": 2,
"pressure_desc": "Falling",
"dewPt": 45,
"heat_index": 59,
"rh": 60,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": null,
"wdir_cardinal": "CALM",
"gust": null,
"wspd": 0,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
},
{
"key": "KSAN",
"class": "observation",
"expire_time_gmt": 1612180260,
"obs_id": "KSAN",
"obs_name": "San Diego",
"valid_time_gmt": 1612173060,
"day_ind": "N",
"temp": 59,
"wx_icon": 27,
"icon_extd": 2700,
"wx_phrase": "Mostly Cloudy",
"pressure_tend": null,
"pressure_desc": null,
"dewPt": 47,
"heat_index": 59,
"rh": 64,
"pressure": 30.04,
"vis": 10,
"wc": 59,
"wdir": 260,
"wdir_cardinal": "W",
"gust": null,
"wspd": 5,
"max_temp": null,
"min_temp": null,
"precip_total": null,
"precip_hrly": 0,
"snow_hrly": null,
"uv_desc": "Low",
"feels_like": 59,
"uv_index": 0,
"qualifier": null,
"qualifier_svrty": null,
"blunt_phrase": null,
"terse_phrase": null,
"clds": "BKN",
"water_temp": null,
"primary_wave_period": null,
"primary_wave_height": null,
"primary_swell_period": null,
"primary_swell_height": null,
"primary_swell_direction": null,
"secondary_swell_period": null,
"secondary_swell_height": null,
"secondary_swell_direction": null
}
]
}
Answered By - balderman
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.