Open In App

Scraping data in network traffic using Python

In this article, we will learn how to scrape data from network traffic using Python.

Modules Needed

There are two ways by which we can scrape the network traffic data.



Method 1: Using selenium’s get_log() method 

To start, download and extract the Chrome WebDriver from here, choosing the release that matches the version of your Chrome browser, and copy the path of the executable.

Approach:



Syntax:

driver.get(url)

Syntax:

driver.get_log("performance")

Example:




# Import the required modules
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import json
  
  
# Main Function
if __name__ == "__main__":
  
    # Enable Performance Logging of Chrome.  Copy the class-level
    # capability dict rather than mutating DesiredCapabilities.CHROME
    # in place — mutating the shared dict would leak this setting into
    # every other Chrome session created in the same process.
    desired_capabilities = dict(DesiredCapabilities.CHROME)
    desired_capabilities["goog:loggingPrefs"] = {"performance": "ALL"}
  
    # Create the webdriver object and pass the arguments
    options = webdriver.ChromeOptions()
  
    # Chrome will start in Headless mode
    options.add_argument('headless')
  
    # Ignores any certificate errors if there is any
    options.add_argument("--ignore-certificate-errors")
  
    # Startup the chrome webdriver with executable path and
    # pass the chrome options and desired capabilities as
    # parameters.
    driver = webdriver.Chrome(executable_path="C:/chromedriver.exe",
                              chrome_options=options,
                              desired_capabilities=desired_capabilities)
  
    try:
        # Send a request to the website and let it load
        driver.get("https://www.geeksforgeeks.org/")
  
        # Sleep for 10 seconds so the page (and its network
        # requests) have time to complete.
        time.sleep(10)
  
        # Gets all the logs from performance in Chrome
        logs = driver.get_log("performance")
    finally:
        # Always release the browser, even if the page load or
        # log retrieval raised.
        print("Quitting Selenium WebDriver")
        driver.quit()
  
    # Keep only the network-related events.  Each raw entry wraps
    # a JSON string under its "message" key.
    network_logs = []
    for entry in logs:
        event = json.loads(entry["message"])["message"]
        if any(prefix in event["method"]
               for prefix in ("Network.response",
                              "Network.request",
                              "Network.webSocket")):
            network_logs.append(event)
  
    # Serialize the filtered events as one valid JSON array in a
    # single json.dump() call — no hand-built brackets and no
    # trailing "{}" sentinel object polluting the data.
    json_file_path = "network_log.json"
    with open(json_file_path, "w", encoding="utf-8") as f:
        json.dump(network_logs, f)
  
    # Read the JSON file back and parse it to find the
    # URLs pointing at images.
    with open(json_file_path, "r", encoding="utf-8") as f:
        logs = json.load(f)
  
    # Iterate the logs
    for log in logs:
  
        # Not every network event carries a request payload;
        # skip the ones that do not instead of swallowing
        # arbitrary exceptions.
        try:
            # URL is present inside the following keys
            url = log["params"]["request"]["url"]
        except KeyError:
            continue
  
        # Checks if the extension is .png or .jpg
        if url.endswith((".png", ".jpg")):
            print(url, end='\n\n')

Output:

The image URLs are highlighted above.

network_log.json containing the image URLs

Method 2: Using browsermobproxy to capture the HAR file from the network tab of the browser

For this, the following requirements need to be satisfied.

pip install browsermob-proxy

Approach:

Syntax:

driver.get(url)

Example:




# Import the required modules
from selenium import webdriver
from browsermobproxy import Server
import time
import json
  
  
# Main Function
if __name__ == "__main__":
  
    # Enter the path of bin folder by
    # extracting browsermob-proxy-2.1.4-bin
    path_to_browsermobproxy = "C:\\browsermob-proxy-2.1.4\\bin\\"
  
    # Start the server with the path and port 8090
    server = Server(path_to_browsermobproxy
                    + "browsermob-proxy", options={'port': 8090})
    server.start()
  
    try:
        # Create the proxy with following parameter as true
        proxy = server.create_proxy(params={"trustAllServers": "true"})
  
        # Create the webdriver object and pass the arguments
        options = webdriver.ChromeOptions()
  
        # Chrome will start in Headless mode
        options.add_argument('headless')
  
        # Ignores any certificate errors if there is any
        options.add_argument("--ignore-certificate-errors")
  
        # Route Chrome's traffic through the proxy so the HAR
        # capture sees every request.
        options.add_argument("--proxy-server={0}".format(proxy.proxy))
  
        # Startup the chrome webdriver with executable path and
        # the chrome options as parameters.
        driver = webdriver.Chrome(executable_path="C:/chromedriver.exe",
                                  chrome_options=options)
  
        try:
            # Create a new HAR file of the following domain
            # using the proxy.
            proxy.new_har("geeksforgeeks.org/")
  
            # Send a request to the website and let it load
            driver.get("https://www.geeksforgeeks.org/")
  
            # Sleep for 10 seconds so the network activity
            # finishes before the HAR is read.
            time.sleep(10)
  
            # proxy.har is already a dict; dump it straight
            # into the HAR file.
            with open("network_log1.har", "w", encoding="utf-8") as f:
                json.dump(proxy.har, f)
        finally:
            # Always release the browser, even on failure.
            print("Quitting Selenium WebDriver")
            driver.quit()
    finally:
        # Shut the proxy server down; the original script left
        # the browsermob process running after exit.
        server.stop()
  
    # Read HAR File and parse it using JSON
    # to find the urls containing images.
    har_file_path = "network_log1.har"
    with open(har_file_path, "r", encoding="utf-8") as f:
        logs = json.load(f)
  
    # Store the network logs from 'entries' key and
    # iterate them
    for log in logs['log']['entries']:
  
        # Skip entries without a request URL instead of
        # swallowing arbitrary exceptions.
        try:
            # URL is present inside the following keys
            url = log['request']['url']
        except KeyError:
            continue
  
        # Checks if the extension is .png or .jpg
        if url.endswith(('.png', '.jpg')):
            print(url, end="\n\n")

Output:

The image URLs are highlighted above.

network_log1.har containing the image URLs


Article Tags :