Issue
I am not familiar with multithreading or how to apply it to make scraping faster; my BeautifulSoup script scrapes the data slowly. Can someone show me how to apply multithreading to my code? This is the page link: https://baroul-timis.ro/tabloul-avocatilor/
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
base_url = 'https://baroul-timis.ro'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

productlink = []
data = requests.get(url).json()
for i, d in enumerate(data["data"], 1):
    link = BeautifulSoup(d["actions"], "html.parser").a["href"]
    comp = base_url + link
    productlink.append(comp)

test = []
for link in productlink:
    wev = {}
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    prod = soup.find_all('div', class_='user-info text-left mb-50')
    for pip in prod:
        title = pip.find('h4').text
        wev['title'] = title
        try:
            phone = pip.select('span', class_='font-weight-bolder')[2].text
        except:
            pass
        wev['phone'] = phone.split('\xa0')
        try:
            email = pip.select('span', class_='font-weight-bolder')[3].text
        except:
            pass
        wev['email'] = email.split('\xa0')
        test.append(wev)

df = pd.DataFrame(test)
print(df)
Solution
Multithreading is ideal for this kind of task because there will be lots of I/O waits while the URLs are accessed and their data acquired. Here's how you could rework it:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
base_url = 'https://baroul-timis.ro'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

test = []

def process(link):
    # fetch and parse one profile page
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    prod = soup.find_all('div', class_='user-info text-left mb-50')
    for pip in prod:
        wev = {}  # fresh dict per profile card
        wev['title'] = pip.find('h4').text
        try:
            wev['phone'] = pip.select('span', class_='font-weight-bolder')[2].text.split('\xa0')
        except Exception:
            pass  # phone missing on this profile
        try:
            wev['email'] = pip.select('span', class_='font-weight-bolder')[3].text.split('\xa0')
        except Exception:
            pass  # e-mail missing on this profile
        # in CPython, list.append() is atomic, so appending to the
        # shared list from multiple threads is safe
        test.append(wev)

# build the list of profile URLs from the site's JSON endpoint;
# each "actions" entry is an HTML fragment containing the link
productlink = []
data = requests.get(url).json()
for d in data["data"]:
    link = BeautifulSoup(d["actions"], "lxml").a["href"]
    productlink.append(base_url + link)

with ThreadPoolExecutor() as executor:
    executor.map(process, productlink)

df = pd.DataFrame(test)
print(df)
This generates a 941-row dataframe in under 44 seconds on my system (24 threads), i.e. roughly 20 URLs per second.
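As a side note, ThreadPoolExecutor() with no arguments uses Python's default pool size (min(32, os.cpu_count() + 4) on Python 3.8+). If you want to pin the thread count yourself, and reuse TCP connections rather than opening a new one per request, something along these lines should work; get_session() and thread_local are helper names I'm introducing here, not part of the code above:

import threading

thread_local = threading.local()

def get_session():
    # one Session per thread: a Session reuses connections, but it is
    # not guaranteed to be thread-safe, so don't share one across threads
    if not hasattr(thread_local, 'session'):
        thread_local.session = requests.Session()
        thread_local.session.headers.update(headers)
    return thread_local.session

# inside process(), replace requests.get(link, headers=headers) with:
#     r = get_session().get(link)

with ThreadPoolExecutor(max_workers=24) as executor:  # pin the pool size explicitly
    executor.map(process, productlink)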
Note: if you don't already have lxml installed, you'll need it; it's generally faster than html.parser.
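It's a one-line install:

pip install lxml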
EDIT:
Multiprocessing version
import requests
from bs4 import BeautifulSoup
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

url = "https://baroul-timis.ro/get-av-data?param=toti-avocatii"
base_url = 'https://baroul-timis.ro'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'
}

def process(link):
    # runs in a worker process, so results are returned to the parent
    # rather than appended to a shared list
    test = []
    r = requests.get(link, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    prod = soup.find_all('div', class_='user-info text-left mb-50')
    for pip in prod:
        wev = {}
        wev['title'] = pip.find('h4').text
        try:
            wev['phone'] = pip.select('span', class_='font-weight-bolder')[2].text.split('\xa0')
        except Exception:
            pass
        try:
            wev['email'] = pip.select('span', class_='font-weight-bolder')[3].text.split('\xa0')
        except Exception:
            pass
        test.append(wev)
    return test

def main():
    productlink = []
    for d in requests.get(url).json()["data"]:
        link = BeautifulSoup(d["actions"], "lxml").a["href"]
        productlink.append(base_url + link)
    test = []
    with ProcessPoolExecutor() as executor:
        for r in executor.map(process, productlink):
            test.extend(r)
    df = pd.DataFrame(test)
    print(df)

if __name__ == '__main__':
    main()
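One knob worth knowing about in this version: with ProcessPoolExecutor, executor.map() accepts a chunksize argument that sends items to the workers in batches, which can reduce inter-process communication overhead on long task lists (it has no effect with threads). A sketch, with the batch size of 8 chosen arbitrarily:

with ProcessPoolExecutor() as executor:
    # each worker now receives 8 links at a time instead of one
    for r in executor.map(process, productlink, chunksize=8):
        test.extend(r)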
Answered By - Albert Winestein