Issue
In a PyCharm venv, I created a GUI and packed it with PyInstaller; the GUI calls scrapy on a file via Popen. From the terminal, Popen was invoking scrapy and scraping completed successfully. After packing, the GUI opened, but the stderr of Popen reported that scrapy was not found. An issue I opened on GitHub helped me find out that PyInstaller was using the user site-packages instead of the venv. I solved it by installing scrapy for the user as well, and after packing with PyInstaller the built GUI now calls scrapy. I still don't know why PyInstaller didn't use the venv packages.
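For anyone debugging a similar mismatch, a quick way to see which interpreter and which scrapy executable a process will actually pick up is a small diagnostic like this (a sketch of my own, not part of the original post; it only assumes scrapy is on PATH when installed):

import shutil
import sys

# Interpreter the GUI itself runs under (venv vs. system/user install).
print("python:", sys.executable)
print("prefix:", sys.prefix)

# First 'scrapy' executable found on PATH; this is what Popen will launch.
print("scrapy:", shutil.which("scrapy"))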
But now I am getting a scrapy error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 62, in run
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/scrapy/core/downloader/middleware.py", line 49, in process_request
    return (yield download_func(request=request, spider=spider))
twisted.web._newclient.RequestGenerationFailed: [<twisted.python.failure.Failure builtins.AttributeError: __enter__>]
Even if I run scrapy as the user (outside the venv) in a terminal, I get this error; running in a terminal inside the venv works fine.
Also, is this the ideal way to pack with PyInstaller, i.e. installing packages for the user as well so that PyInstaller can find them?
GUI code
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
import shlex
from subprocess import Popen
import json

def get_url():
    # printing Entry url to a file
    pass

harvest = None

def watch():
    global harvest
    if harvest:
        if harvest.poll() is not None:
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            harvest = None
        else:
            # indicate that process is running.
            progress_bar.grid()
            progress_bar.start(10)
            root.after(100, watch)

def scrape():
    global harvest
    command_line = shlex.split('scrapy runspider ./scrape.py')
    with open('stdout.txt', 'wb') as out, open('stderr', 'wb') as err:
        harvest = Popen(command_line, stdout=out, stderr=err)
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=lambda: [get_url(), scrape()])
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
Scrapy reproducible code
import scrapy
import json

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = [user_domain]
    start_urls = ['xyz']

    def parse(self, response):
        title = response.css('img::attr(alt)').getall()
        links = response.css('img::attr(src)').getall()
        with open('../images/urls.txt', 'w') as f:
            for i in title:
                f.write(i)
Solution
I was able to get it to work.
Steps to reproduce...
Create a new directory, start a new Python virtual environment, update pip, and install scrapy and pyinstaller into the virtual environment (for example, with the commands below).
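Something like this, assuming a Unix-like shell (the activation command differs on Windows):

python3 -m venv venv
source venv/bin/activate   # Windows: venv\Scripts\activate
pip install --upgrade pip
pip install scrapy pyinstaller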
In the new directory, create the two Python scripts; mine are main.py and scrape.py.
main.py
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
import shlex
import os
import scrapy
from subprocess import Popen
import json

def get_path(name):
    return os.path.join(os.path.dirname(__file__), name).replace("\\", "/")

harvest = None

def watch():
    global harvest
    if harvest:
        if harvest.poll() is not None:
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            harvest = None
        else:
            # indicate that process is running.
            progress_bar.grid()
            progress_bar.start(10)
            root.after(100, watch)

def scrape():
    global harvest
    command_line = shlex.split('scrapy runspider ' + get_path('scrape.py'))
    with open('stdout.txt', 'wb') as out, open('stderr.txt', 'wb') as err:
        harvest = Popen(command_line, stdout=out, stderr=err)
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=scrape)
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
scrape.py
import scrapy
import os

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = [user_domain]
    start_urls = ['https://www.bbc.com/news/in_pictures']  # i just used this for testing.

    def parse(self, response):
        title = response.css('img::attr(alt)').getall()
        links = response.css('img::attr(src)').getall()
        if not os.path.exists('./images'):
            os.makedirs('./images')
        with open('./images/urls.txt', 'w') as f:
            for i in title:
                f.write(i)
        yield {"title": title, "links": links}
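Before packing, it may be worth confirming the spider runs on its own from the project directory:

scrapy runspider scrape.py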
Then run pyinstaller -F main.py, which will generate a main.spec file. Open that file and make these changes:
main.spec
# -*- mode: python ; coding: utf-8 -*-

block_cipher = None

import os

scrape = "scrape.py"
imagesdir = "images"

a = Analysis(
    ['main.py'],
    pathex=[],
    binaries=[],
    datas=[(scrape, '.'), (imagesdir, '.')],  # add these lines
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='main',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,  # Once you have confirmed it is working you can set this to False
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
Then, once that is all done, go back to your terminal, run pyinstaller main.spec, and Bob's your uncle...
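One caveat worth noting: in a one-file build, PyInstaller unpacks bundled data files (such as scrape.py above) into a temporary directory at run time. If the packed app ever fails to locate a bundled file, a resource helper along these lines is a common pattern (a sketch of my own; sys._MEIPASS is set by the PyInstaller bootloader in one-file mode):

import os
import sys

def resource_path(name):
    # One-file builds unpack bundled data into sys._MEIPASS;
    # when running from source, fall back to this script's directory.
    base = getattr(sys, '_MEIPASS', os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(base, name)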
Update
main.py - I essentially just removed the shlex portion and made the path to scrape.py relative to the main.py file path.
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
from subprocess import Popen
import json
import os

def get_url():
    print('Getting URL...')
    data = url.get()
    if not os.path.exists('./data'):
        os.makedirs('./data')
    with open('./data/url.json', 'w') as f:
        json.dump(data, f)

harvest = None

def watch():
    global harvest
    print('watch started')
    if harvest:
        if harvest.poll() is not None:
            print('progress bar ends')
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            # Maybe report harvest.returncode?
            print(f'harvest return code if Poll !None =--######==== {harvest.returncode}')
            print(f'harvest poll =--######==== {harvest.poll}')
            harvest = None
        else:
            # indicate that process is running.
            print('progress bar starts')
            progress_bar.grid()
            progress_bar.start(10)
            print(f'harvest return code =--######==== {harvest.returncode}')
            # Re-schedule `watch` to be called again after 0.1 s.
            root.after(100, watch)

def scrape():
    global harvest
    scrapefile = os.path.join(os.path.dirname(__file__), 'scrape.py')
    # harvest = Popen(command_line)
    with open('stdout.txt', 'wb') as out, open('stderr.txt', 'wb') as err:
        # harvest = Popen('scrapy runspider ./scrape.py', stdout=out, stderr=err, shell=True)
        harvest = Popen(["python3", scrapefile], stdout=out, stderr=err)
    print('harvesting started')
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=lambda: [get_url(), scrape()])
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
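Since this version launches an external interpreter, it may help to verify up front that python3 exists on the target machine's PATH; a minimal sketch (a hypothetical helper of mine, not part of the original answer):

import shutil
from tkinter import messagebox as tkms

def preflight():
    # Abort early with a dialog if the external interpreter is missing.
    if shutil.which("python3") is None:
        tkms.showerror(title='Error', message='python3 was not found on PATH')
        return False
    return True

Calling preflight() at the top of scrape() and returning early when it fails would surface the problem immediately instead of leaving only an error in stderr.txt.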
main.spec
# -*- mode: python ; coding: utf-8 -*-

block_cipher = None

a = Analysis(['main.py'], pathex=[], binaries=[],
             datas=[('scrape.py', '.')],  # <------- this is the only change that I made
             hiddenimports=[], hookspath=[],
             hooksconfig={}, runtime_hooks=[], excludes=[],
             win_no_prefer_redirects=False, win_private_assemblies=False,
             cipher=block_cipher, noarchive=False,)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(pyz, a.scripts, a.binaries, a.zipfiles, a.datas, [],
          name='main', debug=False, bootloader_ignore_signals=False, strip=False,
          upx=True, upx_exclude=[], runtime_tmpdir=None, console=False,
          disable_windowed_traceback=False, argv_emulation=False, target_arch=None,
          codesign_identity=None, entitlements_file=None,)
I made no changes to scrape.py.
Answered By - Alexander