Issue
I have written code that pulls text from websites and analyses it for readability. However, I sometimes get stuck on certain sites. Is there any way to have my program skip to the next iteration of the for loop if a site takes longer than x seconds? If you have any questions or need clarification, just let me know in the comments.
import time
import numpy as np
import pandas as pd
import openpyxl
import reqto as rq
from bs4 import BeautifulSoup
# from SpacySylGetter import *
# import readability
import selenium
from selenium import webdriver
TextIn = pd.read_excel('C:\\Users\\Max von Klemperer\\Desktop\\KeywordLinks\\Aus2.xlsx')
# print(TextIn)
WebURLs = list(TextIn["URL"].values)
Region = list(TextIn["Region"].values)
Keywords = list(TextIn["Keyword"].values)
Rankings = list(TextIn["Ranking"].values)
spaces = 0
syls = 0
counter = 0
characters = 0
sentences = 0
CLIs = []
FL = []
FLAuto = []
WebTexts = []
goodurl = []
goodKW = []
goodRegion = []
goodRanking = []
driver = webdriver.Chrome('C:\\Users\\Max von Klemperer\\Downloads\\chromedriver.exe')
for i in WebURLs:
    try:
        time.sleep(1)
        url = i
        driver.get(url)
        # Selenium 3 API; in Selenium 4 this would be driver.find_element(By.TAG_NAME, 'body')
        el = driver.find_element_by_tag_name('body')
        initText = el.text
        # Keep only letters, digits and basic punctuation
        TextPros = ''.join(
            filter(lambda x: x in '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-.?! \n', initText))
        cleanedStr = ' '.join(TextPros.split())
        print(i)
        # Drop the first and last 600 characters to trim navigation/boilerplate
        textToProc = cleanedStr[600:len(cleanedStr) - 600]
        # str.replace returns a new string, so the result has to be reassigned
        textToProc = textToProc.replace("...", ".")
        textToProc = textToProc.replace("-", " ")
        textToProc = textToProc.replace(".com", " ")
        if 1000 < len(textToProc) < 100000:
            print(textToProc)
            WebTexts.append(textToProc)
            goodurl.append(i)
            goodRegion.append(Region[counter])
            goodKW.append(Keywords[counter])
            goodRanking.append(Rankings[counter])
    except Exception:
        print("Bounced")
    # Increment for every URL (even bounced ones) so the index stays aligned with Region/Keywords/Rankings
    counter = counter + 1
for i in WebTexts:
    words = len(i.split())
    commas = i.count(",")
    spaces = i.count(" ")
    Hyphens = i.count("-")
    # syls = sylsGet(i)
    # print(syls)
    sentences = i.count(".") + i.count("?") + i.count("!")
    # Approximate letter count: total length minus spaces and punctuation
    characters = len(i) - spaces - sentences - commas - Hyphens
    # Coleman-Liau index: 5.89 * (letters per word) - 0.296 * (sentences per 100 words) - 15.8
    CLI = ((5.89 * (characters / words)) - (0.296 * sentences / (words / 100))) - 15.8
    CLIs.append(CLI)
    print(CLI)
    # FLK = 206.835 - (1.015 * words / sentences) - (84.6 * syls / words)
    # print(FLK)
    # FL.append(FLK)
driver.close()
CLIExcel = pd.DataFrame()
toAdd1 = np.array(goodurl)
toAdd2 = np.array(CLIs)
toAdd3 = np.array(goodRegion)
toAdd4 = np.array(goodKW)
toAdd5 = np.array(goodRanking)
# toAdd6 = np.array(FL)
CLIExcel["URL"] = toAdd1
CLIExcel["CLI's"] = toAdd2
CLIExcel["Region"] = toAdd3
CLIExcel["Keyword"] = toAdd4
CLIExcel["Ranking"] = toAdd5
# CLIExcel["Flesch Kinkaid"] = toAdd6
print(CLIExcel)
CLIExcel.to_excel('C:\\Users\\Max von Klemperer\\Desktop\\WorkedCLI.xlsx')
Solution
I was able to fix my issue by reopening the Chrome driver after every 5 sites. It seemed to get stuck if I tried to put too many sites through in one loop. Thanks for the help!
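For reference, here is a minimal sketch of how that fix can be combined with the per-site time limit the question asked about: the driver is restarted after every 5 URLs, and Selenium's set_page_load_timeout makes driver.get raise a TimeoutException so a slow site can be skipped. It reuses the WebURLs list read in above; the make_driver helper and the 10-second limit are illustrative choices, not part of the original script.

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
import time

CHROMEDRIVER = 'C:\\Users\\Max von Klemperer\\Downloads\\chromedriver.exe'

def make_driver():
    # Start a fresh Chrome session and give each page at most 10 seconds to load
    d = webdriver.Chrome(CHROMEDRIVER)
    d.set_page_load_timeout(10)
    return d

driver = make_driver()
for counter, url in enumerate(WebURLs):
    # Restart the driver after every 5 sites so it does not get stuck
    if counter > 0 and counter % 5 == 0:
        driver.quit()
        driver = make_driver()
    try:
        time.sleep(1)
        driver.get(url)  # raises TimeoutException once the 10-second limit is hit
        initText = driver.find_element_by_tag_name('body').text
        # ... same cleaning and scoring as in the script above ...
    except TimeoutException:
        print("Timed out, skipping", url)
        continue
    except Exception:
        print("Bounced")
driver.quit()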
Answered By - MaxVK