Issue
In a PyCharm venv, I created a GUI and packed it with PyInstaller; the GUI calls scrapy on a file via Popen. From the terminal, Popen was invoking scrapy and scraping completed successfully. After packing, the GUI opened, but the stderr of Popen reported that scrapy was not found. An issue I opened on GitHub helped me find out that PyInstaller was using the user site-packages instead of the venv. I solved it by installing scrapy for the user as well, and after packing with PyInstaller the built GUI now calls scrapy. I still don't know why PyInstaller didn't use the venv packages.
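For anyone debugging a similar mismatch, a quick way to see which interpreter and which scrapy executable a process will actually pick up is a small diagnostic like this (a sketch of my own, not part of the original post; it only assumes scrapy is on PATH when installed):

import shutil
import sys

# Interpreter the GUI itself runs under (venv vs. system/user install).
print("python:", sys.executable)
print("prefix:", sys.prefix)

# First 'scrapy' executable found on PATH; this is what Popen will launch.
print("scrapy:", shutil.which("scrapy"))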
But now I am getting a scrapy error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/twisted/internet/defer.py", line 62, in run
    return f(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/scrapy/core/downloader/middleware.py", line 49, in process_request
    return (yield download_func(request=request, spider=spider))
twisted.web._newclient.RequestGenerationFailed: [<twisted.python.failure.Failure builtins.AttributeError: __enter__>]
Even if I run scrapy as the user (outside the venv) in a terminal, I get this error; running in a terminal inside the venv works fine.
Also, is this the ideal way to pack with PyInstaller, i.e. installing packages for the user as well so that PyInstaller can find them?
GUI code
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
import shlex
from subprocess import Popen
import json

def get_url():
    # printing Entry url to a file
    pass

harvest = None

def watch():
    global harvest
    if harvest:
        if harvest.poll() is not None:
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            harvest = None
        else:
            # indicate that process is running.
            progress_bar.grid()
            progress_bar.start(10)
            root.after(100, watch)

def scrape():
    global harvest
    command_line = shlex.split('scrapy runspider ./scrape.py')
    with open('stdout.txt', 'wb') as out, open('stderr', 'wb') as err:
        harvest = Popen(command_line, stdout=out, stderr=err)
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=lambda: [get_url(), scrape()])
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
Scrapy reproducible code
import scrapy
import json

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = [user_domain]
    start_urls = ['xyz']

    def parse(self, response):
        title = response.css('img::attr(alt)').getall()
        links = response.css('img::attr(src)').getall()
        with open('../images/urls.txt', 'w') as f:
            for i in title:
                f.write(i)
Solution
I was able to get it to work.
Steps to reproduce...
Create a new directory, start a new Python virtual environment, update pip, and install scrapy and pyinstaller into the virtual environment (for example, with the commands below).
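Something like this, assuming a Unix-like shell (the activation command differs on Windows):

python3 -m venv venv
source venv/bin/activate   # Windows: venv\Scripts\activate
pip install --upgrade pip
pip install scrapy pyinstaller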
In the new directory, create the two Python scripts; mine are main.py and scrape.py.
main.py
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
import shlex
import os
import scrapy
from subprocess import Popen
import json

def get_path(name):
    return os.path.join(os.path.dirname(__file__), name).replace("\\", "/")

harvest = None

def watch():
    global harvest
    if harvest:
        if harvest.poll() is not None:
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            harvest = None
        else:
            # indicate that process is running.
            progress_bar.grid()
            progress_bar.start(10)
            root.after(100, watch)

def scrape():
    global harvest
    command_line = shlex.split('scrapy runspider ' + get_path('scrape.py'))
    with open('stdout.txt', 'wb') as out, open('stderr.txt', 'wb') as err:
        harvest = Popen(command_line, stdout=out, stderr=err)
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=scrape)
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
scrape.py
import scrapy
import os

class ImgSpider(scrapy.Spider):
    name = 'img'
    # allowed_domains = [user_domain]
    start_urls = ['https://www.bbc.com/news/in_pictures']  # i just used this for testing.

    def parse(self, response):
        title = response.css('img::attr(alt)').getall()
        links = response.css('img::attr(src)').getall()
        if not os.path.exists('./images'):
            os.makedirs('./images')
        with open('./images/urls.txt', 'w') as f:
            for i in title:
                f.write(i)
        yield {"title": title, "links": links}
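Before packing, it may be worth confirming the spider runs on its own from the project directory:

scrapy runspider scrape.py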
Then run pyinstaller -F main.py, which will generate a main.spec file. Open that file and make these changes:
main.spec
# -*- mode: python ; coding: utf-8 -*-

block_cipher = None

import os

scrape = "scrape.py"
imagesdir = "images"

a = Analysis(
    ['main.py'],
    pathex=[],
    binaries=[],
    datas=[(scrape, '.'), (imagesdir, '.')],  # add these lines
    hiddenimports=[],
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    win_no_prefer_redirects=False,
    win_private_assemblies=False,
    cipher=block_cipher,
    noarchive=False,
)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(
    pyz,
    a.scripts,
    a.binaries,
    a.zipfiles,
    a.datas,
    [],
    name='main',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    upx_exclude=[],
    runtime_tmpdir=None,
    console=True,  # Once you have confirmed it is working you can set this to False
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
Then, once that is all done, go back to your terminal, run pyinstaller main.spec, and Bob's your uncle...
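One caveat worth noting: in a one-file build, PyInstaller unpacks bundled data files (such as scrape.py above) into a temporary directory at run time. If the packed app ever fails to locate a bundled file, a resource helper along these lines is a common pattern (a sketch of my own; sys._MEIPASS is set by the PyInstaller bootloader in one-file mode):

import os
import sys

def resource_path(name):
    # One-file builds unpack bundled data into sys._MEIPASS;
    # when running from source, fall back to this script's directory.
    base = getattr(sys, '_MEIPASS', os.path.dirname(os.path.abspath(__file__)))
    return os.path.join(base, name)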
Update
main.py - I essentially just removed the shlex portion and made the path to scrape.py relative to the main.py file path.
import tkinter as tk
from tkinter import messagebox as tkms
from tkinter import ttk
from subprocess import Popen
import json
import os

def get_url():
    print('Getting URL...')
    data = url.get()
    if not os.path.exists('./data'):
        os.makedirs('./data')
    with open('./data/url.json', 'w') as f:
        json.dump(data, f)

harvest = None

def watch():
    global harvest
    print('watch started')
    if harvest:
        if harvest.poll() is not None:
            print('progress bar ends')
            # Update your progressbar to finished.
            progress_bar.stop()
            # if harvest finishes OK then show confirmation message otherwise show error.
            if harvest.returncode == 0:
                mes = tkms.showinfo(title='progress', message='Scraping Done')
                if mes == 'ok':
                    root.destroy()
            else:
                tkms.showinfo(title='Error', message=f'harvest returncode == {harvest.returncode}')
            # Maybe report harvest.returncode?
            print(f'harvest return code if Poll !None =--######==== {harvest.returncode}')
            print(f'harvest poll =--######==== {harvest.poll}')
            harvest = None
        else:
            # indicate that process is running.
            print('progress bar starts')
            progress_bar.grid()
            progress_bar.start(10)
            print(f'harvest return code =--######==== {harvest.returncode}')
            # Re-schedule `watch` to be called again after 0.1 s.
            root.after(100, watch)

def scrape():
    global harvest
    scrapefile = os.path.join(os.path.dirname(__file__), 'scrape.py')
    # harvest = Popen(command_line)
    with open('stdout.txt', 'wb') as out, open('stderr.txt', 'wb') as err:
        # harvest = Popen('scrapy runspider ./scrape.py', stdout=out, stderr=err, shell=True)
        harvest = Popen(["python3", scrapefile], stdout=out, stderr=err)
    print('harvesting started')
    watch()

root = tk.Tk()
root.title("Title")
url = tk.StringVar(root)
entry1 = tk.Entry(root, width=90, textvariable=url)
entry1.grid(row=0, column=0, columnspan=3)
my_button = tk.Button(root, text="Process", command=lambda: [get_url(), scrape()])
my_button.grid(row=2, column=2)
progress_bar = ttk.Progressbar(root, orient=tk.HORIZONTAL, length=300, mode='indeterminate')
progress_bar.grid(row=3, column=2)
progress_bar.grid_forget()
root.mainloop()
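Since this version launches an external interpreter, it may help to verify up front that python3 exists on the target machine's PATH; a minimal sketch (a hypothetical helper of mine, not part of the original answer):

import shutil
from tkinter import messagebox as tkms

def preflight():
    # Abort early with a dialog if the external interpreter is missing.
    if shutil.which("python3") is None:
        tkms.showerror(title='Error', message='python3 was not found on PATH')
        return False
    return True

Calling preflight() at the top of scrape() and returning early when it fails would surface the problem immediately instead of leaving only an error in stderr.txt.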
main.spec
# -*- mode: python ; coding: utf-8 -*-

block_cipher = None

a = Analysis(['main.py'], pathex=[], binaries=[],
             datas=[('scrape.py', '.')],  # <------- this is the only change that I made
             hiddenimports=[], hookspath=[],
             hooksconfig={}, runtime_hooks=[], excludes=[],
             win_no_prefer_redirects=False, win_private_assemblies=False,
             cipher=block_cipher, noarchive=False,)
pyz = PYZ(a.pure, a.zipped_data, cipher=block_cipher)
exe = EXE(pyz, a.scripts, a.binaries, a.zipfiles, a.datas, [],
          name='main', debug=False, bootloader_ignore_signals=False, strip=False,
          upx=True, upx_exclude=[], runtime_tmpdir=None, console=False,
          disable_windowed_traceback=False, argv_emulation=False, target_arch=None,
          codesign_identity=None, entitlements_file=None,)
I made no changes to scrape.py.
Answered By - Alexander