How to get all the links (a href) from a URL page with Scrapy and save them to a JSON file in Python

3 Answers

0 votes
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider(CrawlSpider):
    """Emit one item per ``<a href>`` found on the start page.

    Run with ``scrapy crawl test -o data.json`` to export the items as JSON.

    NOTE(review): Scrapy's documentation advises against overriding
    ``parse`` on a ``CrawlSpider``; the override is kept so the spider's
    interface is unchanged -- confirm it is still invoked on the Scrapy
    version in use.
    """

    name = "test"

    # allowed_domains expects bare domain names; an entry carrying a scheme
    # ("https://...") is a URL, not a domain.
    allowed_domains = ['www.collectivesolver.com']
    start_urls = ['https://www.collectivesolver.com']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then the list of visited page URLs.

        Positional and keyword arguments are forwarded untouched to the
        base class so spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        # URLs of the responses handled by parse().
        self.links = []

    def parse(self, response):
        """Yield ``{'url': <Request>}`` for every anchor href in *response*.

        The value is the ``Request`` object built by ``response.follow``; it
        is stored inside the item rather than yielded on its own, so it ends
        up in the exported data (rendered as ``<Request GET ...>``).
        """
        self.links.append(response.url)
        for href in response.css('a::attr(href)'):
            yield {
                'url': response.follow(href, self.parse)
            }

# Windows 10
# Visual Studio Code - In TERMINAL
 
# scrapy crawl test -o data.json


'''
run:

[
{"url": "<Request GET https://www.collectivesolver.com/>"},
{"url": "<Request GET https://www.collectivesolver.com/questions>"},
{"url": "<Request GET https://www.collectivesolver.com/tags>"},
{"url": "<Request GET https://www.collectivesolver.com/users>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/python>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/php>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/java>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/c%23>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/javascript>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/cpp>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/c>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/vb%23>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/html>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/bootstrap>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/css>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/sql>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/go>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/mysql>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/jquery>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/nodejs>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/reactjs>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/nodejs-express>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/angularjs>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/html5>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/postgresql>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/dom>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/winapi>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/win32>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/android-java>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/bootstrap4>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/css3>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/visual-studio-code>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/software>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/web-hosting>"},
{"url": "<Request GET https://www.collectivesolver.com/user/avibootz>"},
{"url": "<Request GET https://www.collectivesolver.com/tag/c>"},
{"url": "<Request GET https://www.collectivesolver.com/questions>"},
{"url": "<Request GET https://www.collectivesolver.com/tags>"},
{"url": "<Request GET https://www.collectivesolver.com/feedback>"},
...
]

'''

 



answered Jun 18, 2020 by avibootz
edited Jun 18, 2020 by avibootz
0 votes
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class Spider(CrawlSpider):
    """Emit the absolute URL of every ``<a href>`` found on the start page.

    Run with ``scrapy crawl test -o data.json`` to export the items as JSON.

    NOTE(review): Scrapy's documentation advises against overriding
    ``parse`` on a ``CrawlSpider``; the override is kept so the spider's
    interface is unchanged -- confirm it is still invoked on the Scrapy
    version in use.
    """

    name = "test"

    # allowed_domains expects bare domain names; an entry carrying a scheme
    # ("https://...") is a URL, not a domain.
    allowed_domains = ['www.collectivesolver.com']
    start_urls = ['https://www.collectivesolver.com']

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then the list of visited page URLs.

        Positional and keyword arguments are forwarded untouched to the
        base class so spider arguments keep working.
        """
        super().__init__(*args, **kwargs)
        # URLs of the responses handled by parse().
        self.links = []

    def parse(self, response):
        """Yield ``{'url': <str>}`` for every anchor href in *response*.

        ``response.follow`` is used only to resolve the href against the
        page URL; the resulting request's ``.url`` string is what gets
        exported, and the request itself is discarded.
        """
        self.links.append(response.url)
        for href in response.css('a::attr(href)'):
            yield {
                'url': response.follow(href, self.parse).url
            }
            

# Windows 10
# Visual Studio Code - In TERMINAL
# scrapy crawl test -o data.json



'''
run:

[
{"url": "https://www.collectivesolver.com/"},
{"url": "https://www.collectivesolver.com/questions"},
{"url": "https://www.collectivesolver.com/tags"},
{"url": "https://www.collectivesolver.com/users"},
{"url": "https://www.collectivesolver.com/tag/python"},
{"url": "https://www.collectivesolver.com/tag/php"},
{"url": "https://www.collectivesolver.com/tag/java"},
{"url": "https://www.collectivesolver.com/tag/c%23"},
{"url": "https://www.collectivesolver.com/tag/javascript"},
{"url": "https://www.collectivesolver.com/tag/cpp"},
{"url": "https://www.collectivesolver.com/tag/c"},
{"url": "https://www.collectivesolver.com/tag/vb%23"},
{"url": "https://www.collectivesolver.com/tag/html"},
{"url": "https://www.collectivesolver.com/tag/bootstrap"},
{"url": "https://www.collectivesolver.com/tag/css"},
{"url": "https://www.collectivesolver.com/tag/sql"},
{"url": "https://www.collectivesolver.com/tag/go"},
{"url": "https://www.collectivesolver.com/tag/mysql"},
{"url": "https://www.collectivesolver.com/tag/jquery"},
{"url": "https://www.collectivesolver.com/tag/nodejs"},
{"url": "https://www.collectivesolver.com/tag/reactjs"},
{"url": "https://www.collectivesolver.com/tag/nodejs-express"},
{"url": "https://www.collectivesolver.com/tag/angularjs"},
{"url": "https://www.collectivesolver.com/tag/html5"},
{"url": "https://www.collectivesolver.com/tag/postgresql"},
{"url": "https://www.collectivesolver.com/tag/dom"},
{"url": "https://www.collectivesolver.com/tag/winapi"},
{"url": "https://www.collectivesolver.com/tag/win32"},
{"url": "https://www.collectivesolver.com/tag/android-java"},
{"url": "https://www.collectivesolver.com/tag/bootstrap4"},
{"url": "https://www.collectivesolver.com/tag/css3"},
{"url": "https://www.collectivesolver.com/tag/visual-studio-code"},
{"url": "https://www.collectivesolver.com/tag/software"},
{"url": "https://www.collectivesolver.com/tag/web-hosting"},
{"url": "https://www.collectivesolver.com/tag/binary"},
{"url": "https://www.collectivesolver.com/tag/wordpress"},
{"url": "https://www.collectivesolver.com/tag/phpmyadmin"},
{"url": "https://www.collectivesolver.com/tag/firefox"},
{"url": "https://www.collectivesolver.com/tag/wpf"},
{"url": "https://www.collectivesolver.com/tag/clisp"},
{"url": "https://www.collectivesolver.com/tag/laravel"},
{"url": "https://www.collectivesolver.com/tag/netbeans"},
{"url": "https://www.collectivesolver.com/tag/prototype"},
{"url": "https://www.collectivesolver.com/tag/ide"},
{"url": "https://www.collectivesolver.com/tag/dot-net-library"},
{"url": "https://www.collectivesolver.com/tag/opengl"},
{"url": "https://www.collectivesolver.com/tag/xampp"},
{"url": "https://www.collectivesolver.com/tag/windows"},
{"url": "https://www.collectivesolver.com/tag/download"},
{"url": "https://www.collectivesolver.com/tag/fontawesome5"},
{"url": "https://www.collectivesolver.com/tag/xdebug"},
{"url": "https://www.collectivesolver.com/tag/iphone"},
{"url": "https://www.collectivesolver.com/tag/hardware"},
{"url": "https://www.collectivesolver.com/tag/apache"},
{"url": "https://www.collectivesolver.com/tag/webgl"},
{"url": "https://www.collectivesolver.com/tag/xml"},
{"url": "https://www.collectivesolver.com/tag/xhtml"},
{"url": "https://www.collectivesolver.com/tag/browser"},
{"url": "https://www.collectivesolver.com/tag/programming"},
{"url": "https://www.collectivesolver.com/user/avibootz"},
{"url": "https://www.collectivesolver.com/tag/python"},
{"url": "https://www.collectivesolver.com/31926/how-to-use-yield-in-python"},
...
]

'''

 



answered Jun 18, 2020 by avibootz
edited Jun 18, 2020 by avibootz
0 votes
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor


class MySpider(Spider):
    """Export the URL of every link found on each of the start pages.

    Run with ``scrapy crawl test -o data.json`` to write the items as JSON.
    """

    name = 'test'
    start_urls = ['https://blog.scrapinghub.com/',
                  'https://www.collectivesolver.com/']

    def parse(self, response):
        """Yield one ``{'url': ...}`` item per link extracted from *response*."""
        extractor = LinkExtractor()
        yield from (
            {'url': found.url}
            for found in extractor.extract_links(response)
        )

 
# Windows 10
# Visual Studio Code - In TERMINAL
  
# scrapy crawl test -o data.json
 
 
'''
run:
 
[
{"url": "https://www.collectivesolver.com/tag/reactjs"},
{"url": "https://www.collectivesolver.com/tag/nodejs-express"},
{"url": "https://blog.scrapinghub.com/author/robert-cosgrave"},
{"url": "https://blog.scrapinghub.com/author/ivan-ivanov-and-warley-ferreira-lopes"},
{"url": "https://blog.scrapinghub.com/guide-to-web-data-extraction-qa-validation-techniques"},
{"url": "https://blog.scrapinghub.com/web-data-qa-common-validation-pitfalls"},
{"url": "https://blog.scrapinghub.com/author/attila-t%C3%B3th"},
{"url": "https://blog.scrapinghub.com/author/j%C3%BAlio-c%C3%A9sar-batista"},
{"url": "https://www.collectivesolver.com/tag/angularjs"},
{"url": "https://www.collectivesolver.com/tag/html5"},
...
]
 
'''

 



answered Jun 19, 2020 by avibootz

Related questions

1 answer 212 views
1 answer 502 views
1 answer 252 views
...