Issue
I'm trying to get some data from a JavaScript-rendered web page. My code generates multiple links and parses them one by one. The parsing outputs are lists. I wrote this code with help from here, but it produces the lists inside a class. I want to insert the list items into an SQLite table, and because of this I want to make the local list items global. I've tried creating a global list, putting it into the class, and then appending to it and returning it. I've tried inserting the items directly into the database from the processCurrentPage
method, and I've tried creating a list under the class and reaching it via Webpage.list. But none of these methods worked. One of my attempts is shown here; it is not the best one, only an example. I've tried many alternatives like this. Can you please suggest a good way to handle it?
P.S.: I am new to Python. I have been researching this for two whole days and have read all the class documentation, but I couldn't find a way.
import sys
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
import requests
from bs4 import BeautifulSoup
import bs4 as bs
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web engine page that loads a sequence of URLs one after another.

    The text lines scraped from each page are accumulated in ``alldatas``.
    """

    # Declared on the class itself, so the same list object is shared by
    # every instance; processCurrentPage extends it in place.
    alldatas = []

    def __init__(self):
        super().__init__()
        # Each finished load triggers an HTML dump of the current page.
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Remember the URLs to visit and begin loading the first one."""
        self._urls = iter(urls)
        # fetchNext is a property: merely reading it starts the next load.
        self.fetchNext

    @property
    def fetchNext(self):
        """Load the next URL; evaluate to False once none are left."""
        try:
            next_url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(next_url))
        return True

    def processCurrentPage(self, html):
        """Collect the text lines of the 'tablo_dual_board' div, then move on.

        Quits the application when there is no further URL to load.
        """
        url = self.url().toString()  # not used further in this method
        # do stuff with html...
        soup = bs.BeautifulSoup(html, 'html.parser')
        board = soup.find('div', class_='tablo_dual_board')
        self.alldatas += board.text.splitlines()
        if self.fetchNext:
            return
        QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Request the page's HTML and hand it to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        """Ignore JavaScript console messages."""
        # disable javascript error output
if __name__ == '__main__':
    # generate some test urls
    onexurl = "https://1xbahis1.com/en/live/Football/"
    response = requests.get(onexurl)
    soup = BeautifulSoup(response.content, "html.parser")
    income = soup.find_all("ul", {"id":"games_content"})
    links = soup.find_all("a", {"class": "c-events__name"})
    # Each anchor's href is appended to the site's base address.
    urls = ["https://1xbahis1.com/en/" + matchlink.get("href") for matchlink in links]

    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    webpage.start(urls)
    # NOTE(review): this print executes before app.exec_() is called, while
    # alldatas is only filled inside processCurrentPage -- so the list is
    # presumably still empty at this point.
    print(webpage.alldatas)
    exit_code = app.exec_()
    sys.exit(exit_code)
Solution
Below is a version of your script that should do what you want. The scrape_page
function is called for each url that is processed, and the data is added to a global records
list. The process_records
function is called once after all the pages have been scraped. You can use this function to add the records to your database.
import sys
import requests
from bs4 import BeautifulSoup
from PyQt5 import QtCore, QtWidgets, QtWebEngineWidgets
# Global accumulator: scrape_page() appends one list of text lines per
# scraped page, and process_records() reads it after the last page.
records = []
def scrape_page(url, html):
    """Extract the 'tablo_dual_board' text from one page into ``records``.

    Args:
        url: Address of the page, used only for the progress message.
        html: HTML source of the page.

    The text of the first matching div is split into lines and appended to
    the global ``records`` list as a single entry. If the div is missing,
    an error message is printed and nothing is appended.
    """
    print('scrape page:', url)
    board = BeautifulSoup(html, 'html.parser').find('div', class_='tablo_dual_board')
    if board is None:
        print('error: could not find tablo_dual_board')
        return
    records.append(board.text.splitlines())
def process_records():
    """Handle the collected ``records`` after all pages have been scraped.

    Currently it only reports how many records were gathered; this is the
    place to insert them into a database.
    """
    # add record to database ...
    total = len(records)
    print('process records:', total)
def generate_urls(listing_url="https://1xbahis1.com/en/live/Football/",
                  base_url="https://1xbahis1.com/en/", timeout=30):
    """Return the match-page URLs linked from the live listing page.

    Args:
        listing_url: Page whose ``a.c-events__name`` anchors are collected.
        base_url: Prefix prepended to each anchor's ``href`` value.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of URL strings, one per anchor that carries an ``href``.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    # Without a timeout, requests may block forever on an unresponsive host.
    reply = requests.get(listing_url, timeout=timeout)
    soup = BeautifulSoup(reply.content, "html.parser")
    links = soup.find_all("a", {"class": "c-events__name"})
    # Anchors lacking an href yield None, which cannot be concatenated.
    hrefs = (link.get("href") for link in links)
    return [base_url + href for href in hrefs if href is not None]
class WebPage(QtWebEngineWidgets.QWebEnginePage):
    """Web engine page that loads a list of URLs one at a time.

    Each loaded page's HTML is passed to ``scrape_page``; after the last
    page, ``process_records`` is called and the application quits.
    """

    def __init__(self):
        super().__init__()
        # Each finished load triggers an HTML dump of the current page.
        self.loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        """Store the URLs to visit and start loading the first one."""
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        """Load the next URL.

        Returns:
            True if a load was started, False if no URLs are left.
        """
        try:
            next_url = next(self._urls)
        except StopIteration:
            return False
        self.load(QtCore.QUrl(next_url))
        return True

    def processCurrentPage(self, html):
        """Scrape the current page, then continue or finish up."""
        scrape_page(self.url().toString(), html)
        if self.fetchNext():
            return
        # No URLs left: handle the collected records and stop the app.
        process_records()
        QtWidgets.qApp.quit()

    def handleLoadFinished(self):
        """Request the page's HTML and hand it to processCurrentPage."""
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, QWebEnginePage_JavaScriptConsoleMessageLevel, p_str, p_int, p_str_1):
        """Ignore JavaScript console messages."""
        # disable javascript error output
if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    webpage = WebPage()
    # Collect the match URLs, then let the page work through them.
    urls = generate_urls()
    webpage.start(urls)
    # Run the event loop until the page calls quit(), then exit with its code.
    exit_code = app.exec_()
    sys.exit(exit_code)
Answered By - ekhumoro
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.