Issue
I am trying to get Indonesian Dictionary from KBBI for NLP research purposes, I noticed that it is a protected page, and need to do authentication login first, here is the snippets I use with Scrapy Python
import scrapy
import re
import pandas as pd
from scrapy.http import FormRequest
from scrapy import Request
class scrape_kamus_kbbi(scrapy.Spider):
name = "kamus_kbbi"
list_url = []
ALP = [chr(x) for x in range(65, 91)]
sub_directory = "KBBI_FULL_HURUF"
page_set = [142, 232, 47, 76, 38, 23, 75, 50, 43, 44,
239, 85, 343, 32, 26, 274, 1, 69, 195, 178,
30, 11, 17, 2, 5, 7]
login_url = "https://kbbi.kemdikbud.go.id/Account/Login?ReturnUrl"
username = "myusername"
password = "mypassword"
full_directory = "C:/Users/User/Desktop/Data Science Journey/Data Science with Python/Crawling Script/Indonesian Words/" + sub_directory + ".csv"
for h in range(26):
for g in range(1,(page_set[h]+1)):
text_url = "https://kbbi.kemdikbud.go.id/Cari/Alphabet?masukan=" + str(ALP[h]) + "&masukanLengkap=" + str(ALP[h]) + "&page" + str(g)
list_url.append(text_url)
start_urls = [login_url]
def __init__(self):
self.words=[]
def parse(self, response):
self.log("Login page... Posting username & password")
formdata = {'Username': self.username, 'Password': self.password}
return FormRequest.from_response(response, formdata=formdata,
callback=self.after_login)
def after_login(self, response):
for i in range(len(self.list_url)):
yield Request(self.list_url[i], self.parse_page)
def parse_page(self, response):
self.log("Logged in... Grab All KBBI Words...")
kata = response.xpath('.//div[@class="col-md-3"]/a/text()').extract()
for x in range(len(kata)):
self.words.append(kata[x])
kumpulan_kata = pd.DataFrame(self.words, columns=["Kata"])
kumpulan_kata.to_csv(self.full_directory)
from scrapy import cmdline
cmdline.execute("scrapy runspider scapre_kbbi_kemdikbug.py".split())
but I still get redirected code 302
Redirecting (302) to <GET https://kbbi.kemdikbud.go.id/Account/Login?ReturnUrl=%2FCari%2FAlphabet%3Fmasukan%3DG%26masukanLengkap%3DG%26page29> from <GET https://kbbi.kemdikbud.go.id/Cari/Alphabet?masukan=G&masukanLengkap=G&page29>
I didnt know what goes wrong at this point. can anyone point a lead?
Solution
The formdata you pass to the request is wrong.
Should look like this:
formdata = {
'Posel': self.username,
'KataSandi': self.password
}
The correct request's params are these
But you instead send:
{
"__RequestVerificationToken": "some_token",
"Posel": "",
"KataSandi": "",
"IngatSaya": false,
"Username": "{self.username}",
"Password": "{self.password}"
}
Answered By - basckerwil
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.