Issue
These are my work files, and I have also added the terminal log I received when I ran the spider. Thanks!
Settings
# Scrapy settings for antaira project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'antaira'
SPIDER_MODULES = ['antaira.spiders']
NEWSPIDER_MODULE = 'antaira.spiders'
# NOTE(review): this caps the crawl at 25 downloaded pages, after which the
# CloseSpider extension shuts the spider down. The terminal log ends with
# finish_reason 'closespider_pagecount' — this setting (not the web server)
# is why later pages are crawled (already in flight) but never scraped.
# Raise or remove it to crawl the full site.
CLOSESPIDER_PAGECOUNT = 25
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'antaira (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'antaira.middlewares.AntairaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'antaira.middlewares.AntairaDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'antaira.pipelines.AntairaPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
#Domain Limited
#MAX_REQUESTS_PER_DOMAIN = 4
# NOTE(review): this empty dict overrides nothing; the commented entry below
# presumably belongs to a custom domain-limit middleware — harmless as-is,
# but the whole assignment can be removed.
DOWNLOADER_MIDDLEWARES = {
#'<myproject>.middlewares.DomainlimitMiddleware': 543,
}
# NOTE(review): BaseDupeFilter disables duplicate-request filtering project-wide,
# so repeated URLs (e.g. the duplicate category in the spider's start list)
# are crawled again. Remove this line to restore Scrapy's default RFPDupeFilter.
DUPEFILTER_CLASS = 'scrapy.dupefilters.BaseDupeFilter'
The item pipeline is fairly standard. Pipeline
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
class AntairaPipeline:
    """Write every scraped item as one JSON object per line (JSON Lines)
    to ``result.json`` in the working directory.

    The file is opened when the spider starts and closed when it finishes,
    following Scrapy's pipeline lifecycle hooks.
    """

    def open_spider(self, spider):
        # Explicit encoding so the output does not depend on the platform's
        # locale default (json.dumps escapes non-ASCII anyway, but this makes
        # the file's encoding deterministic).
        self.file = open('result.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize the item and append it as a single line; return the item
        # unchanged so later pipelines (if any) still receive it.
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()
Standard Items, nothing exceptional. Items
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class AntairaItem(scrapy.Item):
# Container for one scraped Antaira product page.
# define the fields for your item here like:
# name = scrapy.Field()
# Product model number, e.g. 'LNX-1802G-SFP' (taken from the page's h1).
product_sku = scrapy.Field()
# Raw HTML of the feature bullet list ('section.features h3 + ul').
summary = scrapy.Field()
# List of overview paragraph texts from the product page.
description = scrapy.Field()
# Derived image filename: '<sku>.jpg'.
products_zoom_image = scrapy.Field()
# Absolute URL of the product's main image.
main_image = scrapy.Field()
# Absolute URL of the product detail page.
product_link = scrapy.Field()
#rel_product_link = scrapy.Field()
#rel_links = scrapy.Field()
#datasheet = scrapy.Field()
I changed some of the field names to match my team's database names better. Scrapy Spider
import scrapy
from ..items import AntairaItem
class ProductJumperFix(scrapy.Spider):  # classes should be TitleCase
    """Crawl Antaira product category pages and scrape each product's details.

    Yields one AntairaItem per product detail page.
    """
    name = 'productJumperFix'
    allowed_domains = ['antaira.com']
    custom_settings = {
        # Duplicate-request filtering is disabled for this spider.
        'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
    }

    def start_requests(self):
        # BUG FIX: 'unmanaged-gigabit' was listed twice in the original; with
        # the dupe filter disabled that category was crawled twice, wasting
        # part of the CLOSESPIDER_PAGECOUNT budget. It is now listed once.
        urls = [
            'https://www.antaira.com/products/10-100Mbps',
            'https://www.antaira.com/products/unmanaged-gigabit',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE',
            'https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE',
            'https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43',
            'https://www.antaira.com/products/Unmanaged-10-gigabit',
            'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE',
        ]
        for url in urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Follow every product link found on a category listing page."""
        for url in response.xpath('//div[@class="product-container"]//a/@href').getall():
            product_link = response.urljoin(url)
            yield scrapy.Request(product_link, callback=self.parse_new_item)

    def parse_new_item(self, response):
        """Extract one product's fields from its detail page.

        BUG FIX: the original assignments ended with trailing commas
        (``items['product_sku'] = product_sku,``), which wrapped every value
        in a 1-tuple — visible in the logged output as
        ``'product_sku': ('LNP-1204G-10G-SFP-24',)``. The commas are removed
        so each field holds the bare value.
        """
        for product in response.css('main.products'):
            item = AntairaItem()  # unique item for each iteration
            item['product_link'] = response.url
            name_dirty = product.css('h1.product-name::text').get()
            if name_dirty is None:
                # Page without a product name — nothing meaningful to scrape.
                continue
            product_sku = name_dirty.strip()
            item['product_sku'] = product_sku
            item['summary'] = product.css('section.features h3 + ul').getall()
            item['description'] = product.css('.products .product-overview::text').getall()
            item['products_zoom_image'] = product_sku + '.jpg'
            item['main_image'] = response.urljoin(
                product.css('div.selectors img::attr(src)').get())
            # The unused rel_links/datasheet extraction from the original was
            # dropped; restore it here if those fields are re-enabled in items.py.
            yield item
I have trimmed the majority of the log and kept only the parts showing pages that were crawled but not scraped. Terminal Log
joel@testbed:~/Desktop/antaira/antaira/spiders$ scrapy crawl productJumperFix -O products.csv
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Scrapy 2.6.2 started (bot: antaira)
2022-08-15 16:33:34 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 18.9.0, Python 3.8.10 (default, Jun 22 2022, 20:18:18) - [GCC 9.4.0], pyOpenSSL 19.0.0 (OpenSSL 1.1.1f 31 Mar 2020), cryptography 2.8, Platform Linux-5.15.0-46-generic-x86_64-with-glibc2.29
2022-08-15 16:33:34 [scrapy.crawler] INFO: Overridden settings:
{'BOT_NAME': 'antaira',
'CLOSESPIDER_PAGECOUNT': 25,
'DUPEFILTER_CLASS': 'scrapy.dupefilters.BaseDupeFilter',
'NEWSPIDER_MODULE': 'antaira.spiders',
'ROBOTSTXT_OBEY': True,
'SPIDER_MODULES': ['antaira.spiders']}
2022-08-15 16:33:34 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet Password: 3f9ff0160659640b
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.memusage.MemoryUsage',
'scrapy.extensions.closespider.CloseSpider',
'scrapy.extensions.feedexport.FeedExporter',
'scrapy.extensions.logstats.LogStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware',
'scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2022-08-15 16:33:34 [scrapy.middleware] INFO: Enabled item pipelines:
['antaira.pipelines.AntairaPipeline']
2022-08-15 16:33:34 [scrapy.core.engine] INFO: Spider opened
2022-08-15 16:33:34 [scrapy.extensions.logstats] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2022-08-15 16:33:34 [scrapy.extensions.telnet] INFO: Telnet console listening on 127.0.0.1:6023
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to acquire lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 acquired on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Attempting to release lock 140158480454080 on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [filelock] DEBUG: Lock 140158480454080 released on /home/joel/.cache/python-tldextract/3.8.10.final__usr__7d8fdf__tldextract-3.3.1/publicsuffix.org-tlds/de84b5ca2167d4c83e38fb162f2e8738.tldextract.json.lock
2022-08-15 16:33:35 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/robots.txt> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE> (referer: None)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:36 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit> (referer: None)
2022-08-15 16:33:37 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24>
{'description': (['Antaira Technologies’ LNP-1204G-10G-SFP-24 are industrial '
'gigabit PoE+ unmanaged Ethernet switches featuring '
'8*10/100/1000Tx Gigabit Ethernet ports that support '
'IEEE802.3at for a maximum of 30W/port. The '
'LNP-1204G-10G-SFP-24 has 2*1G SFP slots and 2*10G SFP+ '
'slots which provide options for long-distance fiber '
'connections. The Ethernet switches are designed with high '
'EFT and ESD protection and support standard operating '
'temperature from -40° to 65°C.',
'The LNP-1204G-10G-SFP-24 are IP30 rated and DIN-rail '
'mountable. These Ethernet switches are designed to be '
'powered with low voltage input (12~55VDC) while still '
'providing the higher voltages required by the PoE '
'standards. Additionally, these industrial PoE Ethernet '
'switches provide connectivity for outdoor or harsh '
'industrial automation application environments, such as '
'security surveillance, ITS-traffic monitoring systems, '
'oil/gas and mining, facility management for power/utility, '
'water wastewater treatment plants, and lastly, automated '
'production lines in factory automation.'],),
'main_image': ('https://www.antaira.com/core/media/media.nl?id=1553822&c=685553&h=KRqHvivRvzYNGs_zSsw3x5fAu9EoYxBr3AAjkX2TH7iCoXyh',),
'product_link': 'https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP-24',
'product_sku': ('LNP-1204G-10G-SFP-24',),
'products_zoom_image': 'LNP-1204G-10G-SFP-24.jpg',
'summary': (['<ul>\r\n'
'<li>Supports 8*10/100/1000Tx IEEE 802.3af/at Compliant with '
'30W/Port, 2*1G SFP Slots, and 2*10G SFP+ Slots</li>\r\n'
'<li>Store-and-Forward Switching Architecture</li>\r\n'
'<li>60Gbps Back-Plane (Switching Fabric)</li>\r\n'
'<li>16K MAC Address Table</li>\r\n'
'<li>10Kbytes Jumbo Frame Support</li>\r\n'
'<li>Redundant Power Input Design: 12~55VDC</li>\r\n'
'<li>Bult-in 1 Relay Output for Power Failure Warning</li>\r\n'
'<li>IP30 Rugged Metal Case Design</li>\r\n'
'<li>DIN-Rail and Wall Mount Support Included</li>\r\n'
'<li>Operating Temperature Range: -40°C~65°C</li>\r\n'
'<li>5-Year Warranty</li>\r\n'
'</ul>'],)}
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP-24> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt-T> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-Gigabit-PoE/LNP-C501G-SFP-bt> (referer: https://www.antaira.com/products/Unmanaged-Gigabit-PoE?range=41%2C43%2C43)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit-PoE/LNP-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit-PoE)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/Unmanaged-10-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/Unmanaged-10-gigabit)
2022-08-15 16:33:37 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-0501-ST-M-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/10-100Mbps/LNX-1600-T> (referer: https://www.antaira.com/products/10-100Mbps)
2022-08-15 16:33:38 [scrapy.core.engine] INFO: Closing spider (closespider_pagecount)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C800G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1204G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C501G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-10-100Mbps-PoE/LNP-0800-60-24-T> (referer: https://www.antaira.com/products/unmanaged-10-100Mbps-PoE?range=41%2C48%2C48)
2022-08-15 16:33:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1002G-10G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1600G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-C500G-T> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-2004G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP> (referer: https://www.antaira.com/products/unmanaged-gigabit)
2022-08-15 16:33:39 [scrapy.core.scraper] DEBUG: Scraped from <200 https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP>
{'description': (['Antaira’s new LNX-1802G-SFP industrial gigabit unmanaged '
'Ethernet switch is IP30 rated and DIN-Rail mountable. Each '
'unit is designed with 16 gigabit Ethernet ports and 2 dual '
'rate (100/1000) SFP slots for fiber connections, making it '
'ideal for applications that demand high bandwidth and long '
'distance communication. \r\n',
'\r\n'
'This product provides high EFT and ESD protection to '
'prevent any unregulated voltage and is suitable for harsh '
'environments. The unit also supports a standard operating '
'temperature from -10 to 70°C. \r\n',
' ',
'\r\n'
'The LNX-1802G-SFP is a perfect industrial networking '
'product to support any applications that require high '
'bandwidth or high density connections, such as '
'Power/Utility, Water Wastewater Treatment, Oil/Gas/Mining, '
'Process Control Automation, Security Access Control '
'Systems, and Intelligent Transportation Systems.'],),
'main_image': ('https://www.antaira.com/core/media/media.nl?id=1236032&c=685553&h=ARdQdDsGuiZpMENJKZsmA3gN6RbhLAQSkBjKdazk1YE_PNrG',),
'product_link': 'https://www.antaira.com/products/unmanaged-gigabit/LNX-1802G-SFP',
'product_sku': ('LNX-1802G-SFP',),
'products_zoom_image': 'LNX-1802G-SFP.jpg',
'summary': (['<ul>\r\n'
'<li>Supports 16*10/100/1000Tx + 2*100/1000 SFP ports </li>\r\n'
'<li>Supports Auto MDI/MDI-X Function</li>\r\n'
'<li>Store-and-Forward Switching Architecture</li>\r\n'
'<li>8K MAC Address Table</li>\r\n'
'<li>Surge Protection: 2,000 VDC Support</li>\r\n'
'<li>ESD Protection: 6,000 VDC Support</li>\r\n'
'<li>Redundant Power Input Design: 12~48VDC</li>\r\n'
'<li>Built-in 1 Relay Output for Power Failure Detection</li>\r\n'
'<li>IP30 Rugged Metal Case Design</li>\r\n'
'<li>DIN-Rail and Wall Mount Support</li>\r\n'
'<li>Operating Temperature Range: -10° to 70° C</li>\r\n'
'<li>5-Year Warranty</li>\r\n'
'</ul>'],)}
2022-08-15 16:33:39 [scrapy.extensions.feedexport] INFO: Stored csv feed (30 items) in: products.csv
2022-08-15 16:33:39 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 24728,
'downloader/request_count': 40,
'downloader/request_method_count/GET': 40,
'downloader/response_bytes': 650133,
'downloader/response_count': 40,
'downloader/response_status_count/200': 40,
'elapsed_time_seconds': 4.618773,
'feedexport/success_count/FileFeedStorage': 1,
'finish_reason': 'closespider_pagecount',
'finish_time': datetime.datetime(2022, 8, 15, 23, 33, 39, 600766),
'httpcompression/response_bytes': 3250008,
'httpcompression/response_count': 39,
'item_scraped_count': 30,
'log_count/DEBUG': 75,
'log_count/INFO': 11,
'memusage/max': 58769408,
'memusage/startup': 58769408,
'request_depth_max': 1,
'response_received_count': 40,
'robotstxt/request_count': 1,
'robotstxt/response_count': 1,
'robotstxt/response_status_count/200': 1,
'scheduler/dequeued': 39,
'scheduler/dequeued/memory': 39,
'scheduler/enqueued': 225,
'scheduler/enqueued/memory': 225,
'start_time': datetime.datetime(2022, 8, 15, 23, 33, 34, 981993)}
2022-08-15 16:33:39 [scrapy.core.engine] INFO: Spider closed (closespider_pagecount)
Solution
You can try changing your user agent, turning off ROBOTSTXT_OBEY, and slowing down the crawl. If it is the web server cutting you off, these settings might help mitigate that.
settings.py
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
Answered By - Alexander
0 comments:
Post a Comment
Note: Only a member of this blog may post a comment.