Issue
I would like to scrape all the links to the courses on this website: https://www.formatic-centre.fr/formation/
Apparently the next pages are loaded dynamically with AJAX, so I need to simulate those requests using FormRequest from scrapy.
That's what I did: I looked up the parameters with the developer tools (screenshot: ajax1)
and put those parameters into FormRequest,
but apparently that didn't work; I figured I also needed to include the headers, so that's what I did (screenshot: ajax2).
But it didn't work either. I'm guessing I'm doing something wrong, but what?
Here's my script, if you want (sorry it's quite long, because I put in all the parameters and the headers):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from lxml import html
from scrapy.http import FormRequest

class LinkSpider(scrapy.Spider):
    name = "link"
    # allow_domains = ['https://www.formatic-centre.fr/']
    start_urls = ['https://www.formatic-centre.fr/formation/']
    rules = (Rule(LinkExtractor(allow=r'formation'), callback="parse", follow=True),)

    def parse(self, response):
        card = response.xpath('//a[@class="title"]')
        for a in card:
            yield {'links': a.xpath('@href').get()}
        return [FormRequest(url="https://www.formatic-centre.fr/formation/",
                            formdata={
                                'action': "swlabscore",
                                'module[0]': "top.Top_Controller",
                                'module[1]': "ajax_get_course_pagination",
                                'page': "2",
                                'layout': "course",
                                'limit_post': "",
                                'offset_post': "0",
                                'sort_by': "",
                                'pagination': "yes",
                                'location_slug': "",
                                'columns': "2",
                                'paged': "",
                                'cur_limit': "",
                                'rows': "0",
                                'btn_content': "En+savoir+plus",
                                'uniq_id': "block-13759488265f916bca45c89",
                                'ZmfUNQ': "63y[Jt",
                                'PmhpIuZ_cTnUxqg': "7v@IahmJNMplbCu",
                                'cZWVDbSPzTXRe': "n9oa2k5u4GHWm",
                                'eOBITfdGRuriQ': "hBPN5nObe.ktH",
                                "Accept": "*/*",
                                "Accept-Encoding": "gzip, deflate, br",
                                "Accept-Language": "fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3",
                                "Connection": "keep-alive",
                                "Content-Length": "1010",
                                "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
                                "Cookie": "_ga=GA1.2.815964309.1603392091; _gid=GA1.2.1686929506.1603392091; jlFYkafUWiyJe=LGAWcXg_wUjFo; z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o",
                                "Host": "www.formatic-centre.fr",
                                "Origin": "https://www.formatic-centre.fr",
                                "Referer": "https://www.formatic-centre.fr/formation/",
                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
                                "X-Requested-With": "XMLHttpRequest",
                                "access-control-allow-credentials": "true",
                                "access-control-allow-origin": "https://www.formatic-centre.fr",
                                "cache-control": "no-cache, must-revalidate, max-age=0",
                                "content-encoding": "gzip",
                                "content-length": "2497",
                                "content-type": "text/html; charset=UTF-8",
                                "date": "Thu, 22 Oct 2020 18:42:54 GMT",
                                "expires": "Wed, 11 Jan 1984 05:00:00 GMT",
                                "referrer-policy": "strict-origin-when-cross-origin",
                                "server": "Apache",
                                "set-cookie": "jlFYkafUWiyJe=LGAWcXg_wUjFo; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                                "set-cookie": "z-byDgTnkdcQJSNH=03d1yiqH%40h8uZNtw; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                                "set-cookie": "YeAhrFumyo-HQwpn=5uOhD6viWy%5BYeq3o; expires=Fri, 23-Oct-2020 18:42:54 GMT; Max-Age=86400; path=/; secure",
                                "strict-transport-security": "max-age=15552001; preload",
                                "vary": "Accept-Encoding",
                                "x-content-type-options": "nosniff",
                                "X-Firefox-Spdy": "h2",
                                "x-frame-options": "SAMEORIGIN",
                                "x-robots-tag": "noindex"})]
The script works for the first page and I obtain the links, but when it gets to the FormRequest nothing happens, and I can't obtain the links from the next pages.
Any ideas?
EDIT: I didn't see it at first, but the terminal gives me this error:
2020-10-23 03:51:30 [scrapy.core.engine] DEBUG: Crawled (400) <POST https://www.formatic-centre.fr/formation/> (referer: https://www.formatic-centre.fr/formation/) ['partial']
2020-10-23 03:51:30 [scrapy.spidermiddlewares.httperror] INFO: Ignoring response <400 https://www.formatic-centre.fr/formation/>: HTTP status code is not handled or not allowed
Maybe that could help?
Solution
You have some issues with how you format and send both your headers and the payload itself: in your spider, the form fields, the request headers, and even the server's response headers all end up inside formdata, so the endpoint receives a malformed payload and answers with a 400.
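To see what exactly the endpoint dislikes, don't let Scrapy silently drop that 400: allow it through to your callback and log the body. A minimal sketch against your existing spider (parse_ajax is a hypothetical stand-in for whatever callback you attach to the FormRequest):

class LinkSpider(scrapy.Spider):
    name = "link"
    # Pass 400 responses to the callback instead of discarding them
    # (by default Scrapy only hands you 2xx responses).
    handle_httpstatus_list = [400]

    def parse_ajax(self, response):
        if response.status == 400:
            # The rejection body often says what was wrong with the payload.
            self.logger.warning("Rejected payload: %s", response.text[:500])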
Also, you have to keep changing the page parameter, so the server knows where you are and which batch of results to send back.
I didn't want to set up a new scrapy project, but here's how I got all the links with requests and BeautifulSoup, so hopefully this will nudge you in the right direction. And if it feels like a hack, well, that's because it is one.
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
headers = {
    "accept": "*/*",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "origin": "https://www.formatic-centre.fr",
    "referer": "https://www.formatic-centre.fr/formation/",
    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.99 Safari/537.36",
    "x-requested-with": "XMLHttpRequest",
}
# The raw payload string as captured from the browser's network tab, kept for reference only.
raw_string = "action=swlabscore&module%5B%5D=top.Top_Controller&module%5B%5D=ajax_get_course_pagination&params%5B0%5D%5Bpage%5D=2&params%5B0%5D%5Batts%5D%5Blayout%5D=course&params%5B0%5D%5Batts%5D%5Blimit_post%5D=&params%5B0%5D%5Batts%5D%5Boffset_post%5D=0&params%5B0%5D%5Batts%5D%5Bsort_by%5D=&params%5B0%5D%5Batts%5D%5Bpagination%5D=yes&params%5B0%5D%5Batts%5D%5Blocation_slug%5D=&params%5B0%5D%5Batts%5D%5Bcolumns%5D=2&params%5B0%5D%5Batts%5D%5Bpaged%5D=&params%5B0%5D%5Batts%5D%5Bcur_limit%5D=&params%5B0%5D%5Batts%5D%5Brows%5D=0&params%5B0%5D%5Batts%5D%5Bbtn_content%5D=En+savoir+plus&params%5B0%5D%5Batts%5D%5Buniq_id%5D=block-13759488265f916bca45c89&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Blarge%5D=swedugate-thumb-300x225&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bno-image%5D=thumb-300x225.gif&params%5B0%5D%5Batts%5D%5Bthumb-size%5D%5Bsmall%5D=swedugate-thumb-300x225&params%5B0%5D%5Blayout_course%5D=style-grid&ZmfUNQ=63y[Jt&PmhpIuZ_cTnUxqg=7v@IahmJNMplbCu&cZWVDbSPzTXRe=n9oa2k5u4GHWm&eOBITfdGRuriQ=hBPN5nObe.ktH"
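# For reference: the payload list below is essentially the decoded form of
# raw_string. Running parse_qsl(raw_string, keep_blank_values=True) from
# urllib.parse produces the same pairs, minus the empty atts fields, which
# the endpoint appears not to need.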
# A list of tuples rather than a dict: the module[] key appears twice,
# and a dict cannot hold duplicate keys.
payload = [
    ('action', 'swlabscore'),
    ('module[]', 'top.Top_Controller'),
    ('module[]', 'ajax_get_course_pagination'),
    ('params[0][page]', '1'),
    ('params[0][atts][layout]', 'course'),
    ('params[0][atts][offset_post]', '0'),
    ('params[0][atts][pagination]', 'yes'),
    ('params[0][atts][columns]', '2'),
    ('params[0][atts][rows]', '0'),
    ('params[0][atts][btn_content]', 'En savoir plus'),
    ('params[0][atts][uniq_id]', 'block-13759488265f916bca45c89'),
    ('params[0][atts][thumb-size][large]', 'swedugate-thumb-300x225'),
    ('params[0][atts][thumb-size][no-image]', 'thumb-300x225.gif'),
    ('params[0][atts][thumb-size][small]', 'swedugate-thumb-300x225'),
    ('params[0][layout_course]', 'style-grid'),
    ('ZmfUNQ', '63y[Jt'),
    ('PmhpIuZ_cTnUxqg', '7v@IahmJNMplbCu'),
    ('cZWVDbSPzTXRe', 'n9oa2k5u4GHWm'),
    ('eOBITfdGRuriQ', 'hBPN5nObe.ktH'),
]
all_links = []

for page in range(1, 10):
    # Swap in the current page number; index 3 is the params[0][page] entry.
    payload[3] = ('params[0][page]', str(page))
    response = requests.post(
        "https://www.formatic-centre.fr/wp-admin/admin-ajax.php?",
        headers=headers,
        data=urlencode(payload),
    )
    print(f"Getting links from page {page}...")
    soup = BeautifulSoup(response.text, "html.parser").find_all("a", class_="btn btn-green")
    links = [i["href"] for i in soup]
    print('\n'.join(links))
    all_links.extend(links)

with open("formatic-center_links.txt", "w") as f:
    f.writelines("\n".join(all_links) + "\n")
This produces a file with all the links under the EN SAVOIR PLUS buttons:
https://www.formatic-centre.fr/formation/les-regles-juridiques-du-teletravail/
https://www.formatic-centre.fr/formation/mieux-gerer-son-stress-en-periode-du-covid-19/
https://www.formatic-centre.fr/formation/dynamiser-vos-equipes-special-post-confinement/
https://www.formatic-centre.fr/formation/conduire-ses-entretiens-specifique-post-confinement/
https://www.formatic-centre.fr/formation/cours-excel/
https://www.formatic-centre.fr/formation/autocad-3d-2/
https://www.formatic-centre.fr/formation/concevoir-et-developper-une-strategie-marketing/
https://www.formatic-centre.fr/formation/preparer-soutenance/
https://www.formatic-centre.fr/formation/mettre-en-place-une-campagne-adwords/
https://www.formatic-centre.fr/formation/utiliser-google-analytics/
and so on ...
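If you would rather stay inside scrapy, the same loop translates to FormRequest almost one-to-one. Here's a rough, untested sketch, assuming it lives in the same file as the headers and payload definitions above; FormRequest accepts an iterable of (key, value) tuples as formdata and issues a POST by default:

import scrapy

class FormationLinkSpider(scrapy.Spider):
    name = "formation_links"

    def start_requests(self):
        for page in range(1, 10):
            data = list(payload)                       # copy, so the module-level list isn't mutated
            data[3] = ('params[0][page]', str(page))   # swap in the current page number
            # Note: if the brotli package isn't installed, consider dropping
            # "br" from accept-encoding so Scrapy can decode the response.
            yield scrapy.FormRequest(
                "https://www.formatic-centre.fr/wp-admin/admin-ajax.php",
                formdata=data,      # the form fields go here...
                headers=headers,    # ...and the headers here, never mixed into the payload
                callback=self.parse_links,
            )

    def parse_links(self, response):
        for href in response.css('a.btn.btn-green::attr(href)').getall():
            yield {'link': href}

Running it with scrapy runspider should yield the same links the requests version collects.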
Answered By - baduker