Issue
I am building a web crawler as part of my internship and I cannot find the reason I am getting empty lists when I filter by certain attributes. I am trying to scrape names, dates, and locations of conferences.
I have tried several different methods and they've all produced the same result. You can see all the different methods I've tried below:
class AACRSpider(scrapy.Spider):
    """Spider for the AACR meetings event-listing page.

    ``parse`` tries four different selectors (two Scrapy CSS selectors and
    two BeautifulSoup lookups) against the downloaded page; as the inline
    notes record, each of them returns an empty list.
    """

    name = 'aacr'
    # allowed_domains expects bare domain names. An entry that includes a
    # path (e.g. 'aacr.org/MEETINGS') is not a domain, so requests to
    # www.aacr.org would not match it.
    allowed_domains = ['aacr.org']
    start_urls = ['https://www.aacr.org/MEETINGS/PAGES/EVENTLISTING.ASPX#k=']

    def parse(self, response):
        """Run each attempted selector against ``response``.

        The results are discarded; the calls are kept side by side to show
        that every approach yields ``[]`` for this page.
        """
        soup = BeautifulSoup(response.text, "lxml")
        response.css("#Groups a::text").getall()  #~ returns []
        response.css(".result-event-date::text").getall()  #~ returns []
        soup.find_all('span', {'class': 'result-event-date'})  #~ returns []
        soup.find_all(class_="result-event-date")  #~ returns []
        #....
All four of the methods above return an empty list instead of the text that I know exists on the web page. As far as I can tell, these tags are not part of an iframe. I am not very familiar with JavaScript; is it somehow limiting what my crawler can access?
Solution
The page gets that content dynamically by sending XML in an XHR, and you can send the same request yourself. It is sent as a POST request, as shown below. You then parse the JSON response, which holds the required info.
from bs4 import BeautifulSoup as bs
import requests
import json
import pandas as pd
import re
# HTTP headers sent with the POST request below.
# NOTE(review): the X-RequestDigest value carries a "07 Jun 2019" timestamp;
# presumably it has to be refreshed from the site before reuse -- confirm.
headers = {}
headers['User-Agent'] = 'Mozilla/5.0'
headers['X-RequestDigest'] = '0x34AF663EB174C6B490674D6D041668FE4928C0A60754D7CAB6C39B120504145C2DD53F657F7D82D78D6EFFDE3ADEDC210520BB12197AC1966ACD9562B94EB096,07 Jun 2019 20:18:44 -0000'
headers['X-Requested-With'] = 'XMLHttpRequest'
# XML request body that is POSTed to the ProcessQuery endpoint below.
# It uses the SharePoint client-query schema (see the xmlns attribute), sets
# RowsPerPage and RowLimit to 10, adds RefinableDate03 to the SortList, and
# calls ExecuteQueries for the query id
# 'b759b507-ba34-499a-b350-9478f1deb96cDefault' -- the same id is used as a
# key when the JSON response is read.
# NOTE(review): the embedded GUIDs (SourceId, ListId, QuerySession) look
# captured from a browser session; confirm they are still valid before reuse.
xml = '''<Request xmlns="http://schemas.microsoft.com/sharepoint/clientquery/2009" SchemaVersion="15.0.0.0" LibraryVersion="15.0.0.0" ApplicationName="Javascript Library"><Actions><ObjectPath Id="1" ObjectPathId="0" /><SetProperty Id="2" ObjectPathId="0" Name="TimeZoneId"><Parameter Type="Number">10</Parameter></SetProperty><ObjectPath Id="4" ObjectPathId="3" /><Method Name="Add" Id="5" ObjectPathId="3"><Parameters><Parameter Type="String">RefinableDate03</Parameter><Parameter Type="Number">0</Parameter></Parameters></Method><SetProperty Id="6" ObjectPathId="0" Name="Culture"><Parameter Type="Number">-1</Parameter></SetProperty><SetProperty Id="7" ObjectPathId="0" Name="RowsPerPage"><Parameter Type="Number">10</Parameter></SetProperty><SetProperty Id="8" ObjectPathId="0" Name="RowLimit"><Parameter Type="Number">10</Parameter></SetProperty><SetProperty Id="9" ObjectPathId="0" Name="TotalRowsExactMinimum"><Parameter Type="Number">11</Parameter></SetProperty><SetProperty Id="10" ObjectPathId="0" Name="SourceId"><Parameter Type="Guid">{ec6b6718-a344-4f42-9942-f6c673ab1089}</Parameter></SetProperty><ObjectPath Id="12" ObjectPathId="11" /><Method Name="SetQueryPropertyValue" Id="13" ObjectPathId="11"><Parameters><Parameter Type="String">SourceName</Parameter><Parameter TypeId="{b25ba502-71d7-4ae4-a701-4ca2fb1223be}"><Property Name="BoolVal" Type="Boolean">false</Property><Property Name="IntVal" Type="Number">0</Property><Property Name="QueryPropertyValueTypeIndex" Type="Number">1</Property><Property Name="StrArray" Type="Null" /><Property Name="StrVal" Type="String">AACR Event List Source</Property></Parameter></Parameters></Method><Method Name="SetQueryPropertyValue" Id="14" ObjectPathId="11"><Parameters><Parameter Type="String">SourceLevel</Parameter><Parameter TypeId="{b25ba502-71d7-4ae4-a701-4ca2fb1223be}"><Property Name="BoolVal" Type="Boolean">false</Property><Property Name="IntVal" Type="Number">0</Property><Property Name="QueryPropertyValueTypeIndex" 
Type="Number">1</Property><Property Name="StrArray" Type="Null" /><Property Name="StrVal" Type="String">SPWeb</Property></Parameter></Parameters></Method><SetProperty Id="15" ObjectPathId="0" Name="Refiners"><Parameter Type="String">RefinableString59(filter=15/0/*),RefinableString58(filter=15/0/*)</Parameter></SetProperty><ObjectPath Id="17" ObjectPathId="16" /><Method Name="Add" Id="18" ObjectPathId="16"><Parameters><Parameter Type="String">Title</Parameter></Parameters></Method><Method Name="Add" Id="19" ObjectPathId="16"><Parameters><Parameter Type="String">Path</Parameter></Parameters></Method><Method Name="Add" Id="20" ObjectPathId="16"><Parameters><Parameter Type="String">Author</Parameter></Parameters></Method><Method Name="Add" Id="21" ObjectPathId="16"><Parameters><Parameter Type="String">SectionNames</Parameter></Parameters></Method><Method Name="Add" Id="22" ObjectPathId="16"><Parameters><Parameter Type="String">SiteDescription</Parameter></Parameters></Method><SetProperty Id="23" ObjectPathId="0" Name="TrimDuplicates"><Parameter Type="Boolean">false</Parameter></SetProperty><Method Name="SetQueryPropertyValue" Id="24" ObjectPathId="11"><Parameters><Parameter Type="String">ListId</Parameter><Parameter TypeId="{b25ba502-71d7-4ae4-a701-4ca2fb1223be}"><Property Name="BoolVal" Type="Boolean">false</Property><Property Name="IntVal" Type="Number">0</Property><Property Name="QueryPropertyValueTypeIndex" Type="Number">1</Property><Property Name="StrArray" Type="Null" /><Property Name="StrVal" Type="String">4dfaa8e2-a519-4988-b774-f81961091dba</Property></Parameter></Parameters></Method><Method Name="SetQueryPropertyValue" Id="25" ObjectPathId="11"><Parameters><Parameter Type="String">ListItemId</Parameter><Parameter TypeId="{b25ba502-71d7-4ae4-a701-4ca2fb1223be}"><Property Name="BoolVal" Type="Boolean">false</Property><Property Name="IntVal" Type="Number">4</Property><Property Name="QueryPropertyValueTypeIndex" Type="Number">2</Property><Property Name="StrArray" 
Type="Null" /><Property Name="StrVal" Type="Null" /></Parameter></Parameters></Method><SetProperty Id="26" ObjectPathId="0" Name="ResultsUrl"><Parameter Type="String">https://www.aacr.org/MEETINGS/PAGES/EVENTLISTING.ASPX#k=</Parameter></SetProperty><SetProperty Id="27" ObjectPathId="0" Name="ClientType"><Parameter Type="String"></Parameter></SetProperty><Method Name="SetQueryPropertyValue" Id="28" ObjectPathId="11"><Parameters><Parameter Type="String">QuerySession</Parameter><Parameter TypeId="{b25ba502-71d7-4ae4-a701-4ca2fb1223be}"><Property Name="BoolVal" Type="Boolean">false</Property><Property Name="IntVal" Type="Number">0</Property><Property Name="QueryPropertyValueTypeIndex" Type="Number">1</Property><Property Name="StrArray" Type="Null" /><Property Name="StrVal" Type="String">75323e1b-fbc8-437d-a884-9474eb16e68a</Property></Parameter></Parameters></Method><SetProperty Id="29" ObjectPathId="0" Name="ProcessPersonalFavorites"><Parameter Type="Boolean">false</Parameter></SetProperty><SetProperty Id="30" ObjectPathId="0" Name="SafeQueryPropertiesTemplateUrl"><Parameter Type="String">querygroup://webroot/PAGES/EVENTLISTING.ASPX?groupname=Default</Parameter></SetProperty><SetProperty Id="31" ObjectPathId="0" Name="IgnoreSafeQueryPropertiesTemplateUrl"><Parameter Type="Boolean">false</Parameter></SetProperty><ObjectPath Id="33" ObjectPathId="32" /><ExceptionHandlingScope Id="34"><TryScope Id="36"><Method Name="ExecuteQueries" Id="38" ObjectPathId="32"><Parameters><Parameter Type="Array"><Object Type="String">b759b507-ba34-499a-b350-9478f1deb96cDefault</Object></Parameter><Parameter Type="Array"><Object ObjectPathId="0" /></Parameter><Parameter Type="Boolean">true</Parameter></Parameters></Method></TryScope><CatchScope Id="40" /></ExceptionHandlingScope></Actions><ObjectPaths><Constructor Id="0" TypeId="{80173281-fffd-47b6-9a49-312e06ff8428}" /><Property Id="3" ParentId="0" Name="SortList" /><Property Id="11" ParentId="0" Name="Properties" /><Property Id="16" 
ParentId="0" Name="HitHighlightedProperties" /><Constructor Id="32" TypeId="{8d2ac302-db2f-46fe-9015-872b35f15098}" /></ObjectPaths></Request>'''
# POST the XML query to the ProcessQuery endpoint, as the page's own XHR does.
r = requests.post(
    'https://www.aacr.org/Meetings/_vti_bin/client.svc/ProcessQuery',
    data=xml,
    headers=headers,
)
# Fail with a clear HTTP error (e.g. a rejected request digest) instead of an
# obscure parsing error further down.
r.raise_for_status()

# The imports bind BeautifulSoup as `bs` and pandas as `pd`; using the
# unaliased names here raised NameError.
soup = bs(r.content, 'lxml')
# The JSON payload is read from the text of the first <p> element of the
# parsed response.
data = json.loads(soup.select_one('p').text)

# Hoisted out of the loop: matches the first run of digits in a date field.
# presumably the fields look like "/Date(<epoch-ms>)/" -- verify against a
# live response.
digits = re.compile(r'(\d+)')

results = []
for item in data[12]['b759b507-ba34-499a-b350-9478f1deb96cDefault']['ResultTables'][0]['ResultRows']:
    title = item['Title']
    location = item['EventCityStOWSTEXT']
    # Cast to int before converting: to_datetime(unit='ms') is meant for
    # numeric input, and passing digit strings is deprecated in pandas 2.x.
    start = pd.to_datetime(int(digits.search(item['MeetingStartDate']).group(1)), unit='ms')
    end = pd.to_datetime(int(digits.search(item['MeetingEndDate']).group(1)), unit='ms')
    results.append([title, start, end, location])

# Widen the display so all four columns print on one line.
pd.options.display.max_columns = 4
pd.set_option('display.width', 1000)
df = pd.DataFrame(results, columns=['Title', 'Start', 'End', 'Location'])
print(df)
Answered By - QHarr
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.