Scrape multiple urls using QWebPage

The problem with your program is that you are attempting to create a new QApplication with every url you fetch.

Instead, only one QApplication and one WebPage should be created. The WebPage can use its loadFinished signal to create an internal loop by fetching a new url after each one has been processed. Custom html processing can be added by connecting a user-defined slot to a signal which emits the html text and the url when they become available. The scripts below (for PyQt5 and PyQt4) show how to implement this.

Here are some examples which show how to use the WebPage class:

Usage:

def my_html_processor(html, url):
    print('loaded: [%d chars] %s' % (len(html), url))

import sys
app = QApplication(sys.argv)
webpage = WebPage(verbose=False)
webpage.htmlReady.connect(my_html_processor)

# example 1: process list of urls

urls = ['https://en.wikipedia.org/wiki/Special:Random'] * 3
print('Processing list of urls...')
webpage.process(urls)

# example 2: process one url continuously
#
# import signal, itertools
# signal.signal(signal.SIGINT, signal.SIG_DFL)
#
# print('Processing url continuously...')
# print('Press Ctrl+C to quit')
#
# url="https://en.wikipedia.org/wiki/Special:Random"
# webpage.process(itertools.repeat(url))

sys.exit(app.exec_())

PyQt5 WebPage:

from PyQt5.QtCore import pyqtSignal, QUrl
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEnginePage

class WebPage(QWebEnginePage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super().__init__()
        self._verbose = verbose
        self.loadFinished.connect(self.handleLoadFinished)

    def process(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.load(QUrl(url))
        return True

    def processCurrentPage(self, html):
        self.htmlReady.emit(html, self.url().toString())
        if not self.fetchNext():
            QApplication.instance().quit()

    def handleLoadFinished(self):
        self.toHtml(self.processCurrentPage)

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super().javaScriptConsoleMessage(*args, **kwargs)

PyQt4 WebPage:

from PyQt4.QtCore import pyqtSignal, QUrl
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage

class WebPage(QWebPage):
    htmlReady = pyqtSignal(str, str)

    def __init__(self, verbose=False):
        super(WebPage, self).__init__()
        self._verbose = verbose
        self.mainFrame().loadFinished.connect(self.handleLoadFinished)

    def start(self, urls):
        self._urls = iter(urls)
        self.fetchNext()

    def fetchNext(self):
        try:
            url = next(self._urls)
        except StopIteration:
            return False
        else:
            self.mainFrame().load(QUrl(url))
        return True

    def processCurrentPage(self):
        self.htmlReady.emit(
            self.mainFrame().toHtml(), self.mainFrame().url().toString())
        print('loaded: [%d bytes] %s' % (self.bytesReceived(), url))

    def handleLoadFinished(self):
        self.processCurrentPage()
        if not self.fetchNext():
            QApplication.instance().quit()

    def javaScriptConsoleMessage(self, *args, **kwargs):
        if self._verbose:
            super(WebPage, self).javaScriptConsoleMessage(*args, **kwargs)

Leave a Comment