Issue
In this program i am not using request or beautiful soup function. I'm instead only using the datetime to extract the URLs. Now in the current program, I have written to extract the values for a long period. I want to make it in such a way that, if I automate this program and it runs today, it will extract yesterday's data. Similarly if it runs tomorrow, it will extract todays data and so on.
here is the code,
import datetime
from datetime import date, datetime,timedelta
import warnings
import datetime
import pandas as pd
import wget
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError
def date_range(start_date,end_date):
for n in range(int((end_date-start_date).days)):
yield start_date + timedelta(n)
def get_urls(base_url):
part_two = "/dailyCoal1-"
end_part = ".xlsx"
start_date = date(2020,11,1)
end_date = datetime.datetime.now().date()
start_urls = list()
for single_date in date_range(start_date, end_date):
start_urls.append(single_date.strftime(base_url+'%d-%m-%Y'+part_two+'%Y-%m-%d'+end_part))
return start_urls
def excel_download(link,out):
#downloads a given link provided to a output directory in out
wget.download(link,out)
if __name__ =="__main__":
base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
mypath = "/Users/vp/Desktop/temp"
temp_folder = '/Users/vp/Desktop/temp'
out_folder = "/Users/vp/Desktop/NPP"
log_file = os.path.join(out_folder,'debug_log_npp.log')
out_file = os.path.join(out_folder,'Energy_inputs_npp.csv')
file_links = get_urls(base_url)
for link in file_links:
try:
excel_download(link,temp_folder)
except HTTPError:
content = "HTTP issue while capturing data for this link - " + link
log_writer(log_file,content)
continue
file = glob.glob(os.path.join(temp_folder,'*.xlsx'),recursive=True)[0]
df = pd.read_excel(file)
To capture yesterday's data, i created this in the main function where i check for yesterday = and then cancel if it isnt yesterday. But then its throwing error as it constantly picks the start date as its day one.
if(date_time_obj != Yesterday):
os.remove(file)
content = "Date mis-matched - " + str(date_time_obj) + " " + str(Yesterday)
In this program, date_time_obj - is the date it is currently trying to extract data for.
Everyday if this program runs at 8pm, it needs to only capture one day before data on a daily basis.
if this cannot be done in datetime, but only on request or bs4, then how do i approach this problem?
Solution
I don't know if you wanted a valid link as your code doesn't seem to produce those for me but you only need to tweak to work off start_date only and return a single item to return yesterday's link matching with your current output for same date.
import datetime
from datetime import date, datetime,timedelta
import warnings
import datetime
import pandas as pd
import glob
import os
warnings.filterwarnings("ignore")
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from urllib.error import HTTPError
def get_url(base_url):
part_two = "/dailyCoal1-"
end_part = ".xlsx"
start_date = datetime.datetime.now().date() + timedelta(-1)
start_url = start_date.strftime(base_url+'%d-%m-%Y'+part_two+'%Y-%m-%d'+end_part)
return start_url
def excel_download(link,out):
#downloads a given link provided to a output directory in out
wget.download(link,out)
if __name__ =="__main__":
base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
mypath = "/Users/vp/Desktop/temp"
temp_folder = '/Users/vp/Desktop/temp'
out_folder = "/Users/vp/Desktop/NPP"
log_file = os.path.join(out_folder,'debug_log_npp.log')
out_file = os.path.join(out_folder,'Energy_inputs_npp.csv')
file_link = get_url(base_url)
print(file_link)
Answered By - QHarr
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.