Open In App

Extract URLs present in a given string

Last Updated : 07 Nov, 2023
Improve
Improve
Like Article
Like
Save
Share
Report

Given a string S, the task is to find and extract all the URLs from the string. If no URL is present in the string, then print “-1”.

Examples:

Input: S = “Welcome to https://www.geeksforgeeks.org Computer Science Portal”
Output: https://www.geeksforgeeks.org
Explanation:
The given string contains the URL ‘https://www.geeksforgeeks.org’.

Input: S = “Welcome to https://write.geeksforgeeks.org portal of https://www.geeksforgeeks.org Computer Science Portal”
Output:
https://write.geeksforgeeks.org 
https://www.geeksforgeeks.org
Explanation:
The given string contains two URLs ‘https://write.geeksforgeeks.org’ and ‘https://www.geeksforgeeks.org’.

Approach: The idea is to use Regular Expression to solve this problem. Follow the steps below to solve the given problem:

regex = “\\b((?:https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])”

  • Create an ArrayList in Java and compile the regular expression using Pattern.compile().
  • Match the given string with the regular expression. In Java, this can be done by using Pattern.matcher().
  • Find the substring from the first index of match result to the last index of the match result and add this substring into the list.
  • After completing the above steps, if the list is found to be empty, then print “-1” as there is no URL present in the string S. Otherwise, print all the strings stored in the list.

Below is the implementation of the above approach:

C++




#include <iostream>
#include <regex>
#include <vector>
using namespace std;
 
// Function to extract all the URLs from the string
// Finds every URL in `str` and prints each one on its own line.
// Prints "-1" when the string contains no URL.
//
// A URL here is one of the schemes http, https, ftp or file (matched
// case-insensitively), followed by "://" and a run of URL characters.
void extractURL(string str)
{
    // The hyphen is listed first in each character class so that it is taken
    // literally; without it a host such as "my-site.org" would be cut off at
    // the first '-'. The final class leaves out '?', '!', ':', ',', '.' and
    // ';' so that trailing sentence punctuation is not part of the match.
    // The pattern is compiled once and reused on later calls.
    static const regex url_pattern(
        "\\b((?:https?|ftp|file)://"
        "[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*"
        "[-a-zA-Z0-9+&@#/%=~_|])",
        regex_constants::icase);

    // Collect every non-overlapping match, in order of appearance
    vector<string> url_list;
    const sregex_iterator no_more_matches;
    for (sregex_iterator it(str.begin(), str.end(), url_pattern);
         it != no_more_matches; ++it) {
        url_list.push_back(it->str());
    }

    // No URL at all: report -1 and stop
    if (url_list.empty()) {
        cout << "-1" << endl;
        return;
    }

    // Otherwise print the URLs, one per line (by reference, no copies)
    for (const string& url : url_list) {
        cout << url << endl;
    }
}
 
// Driver Code
int main()
{
    // Given String str
    string str = "Welcome to https://www.geeksforgeeks.org Computer Science Portal";
 
    // Function Call
    extractURL(str);
 
    return 0;
}


Java




import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
public class ExtractURL {

    /**
     * Pattern for http, https, ftp and file URLs, matched case-insensitively.
     *
     * The hyphen is listed first in each character class so that it is taken
     * literally; without it a host such as "my-site.org" would be cut off at
     * the first '-'. The final class leaves out '?', '!', ':', ',', '.' and
     * ';' so that trailing sentence punctuation is not part of the match.
     * Compiled once and reused by every call.
     */
    private static final Pattern URL_PATTERN = Pattern.compile(
            "\\b((?:https?|ftp|file)://"
                    + "[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*"
                    + "[-a-zA-Z0-9+&@#/%=~_|])",
            Pattern.CASE_INSENSITIVE);

    /**
     * Prints every URL found in the given string, one per line, in order of
     * appearance. Prints "-1" when the string contains no URL.
     *
     * @param str the text to scan for URLs
     */
    public static void extractURL(String str) {
        // Collect every non-overlapping match
        ArrayList<String> urlList = new ArrayList<>();
        Matcher matcher = URL_PATTERN.matcher(str);
        while (matcher.find()) {
            urlList.add(matcher.group());
        }

        // No URL at all: report -1 and stop
        if (urlList.isEmpty()) {
            System.out.println("-1");
            return;
        }

        // Otherwise print the URLs, one per line
        for (String url : urlList) {
            System.out.println(url);
        }
    }

    /** Driver: runs the extractor on a sample sentence. */
    public static void main(String[] args) {
        // Given String str
        String str = "Welcome to https://www.geeksforgeeks.org "
                + "Computer Science Portal";

        // Function Call
        extractURL(str);
    }
}


Python3




import re
 
def extractURL(str):
    """Print each URL found in ``str`` on its own line, or "-1" if none.

    A URL is an http, https, ftp or file scheme (matched case-insensitively)
    followed by "://" and a run of URL characters. URLs are printed in the
    order in which they appear in the text.
    """
    # Same pattern as before; compiled with case-insensitive matching
    pattern = re.compile(
        r'\b((?:https?|ftp|file):\/\/[-a-zA-Z0-9+&@#\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#\/%=~_|])',
        re.IGNORECASE,
    )

    # group(0) is the full matched text, i.e. the slice from start() to end()
    found = [hit.group(0) for hit in pattern.finditer(str)]

    # Nothing matched: report -1 and stop
    if not found:
        print("-1")
        return

    # Otherwise print the URLs, one per line
    for link in found:
        print(link)
 
# Driver Code
if __name__ == '__main__':

    # Sample sentence that contains exactly one URL
    string = ("Welcome to https://www.geeksforgeeks.org "
              "Computer Science Portal")

    # Print every URL found in the sample
    extractURL(string)


C#




using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
 
class Program
{
    // Pattern for http, https, ftp and file URLs.
    //
    // The hyphen is listed first in each character class so that it is taken
    // literally; without it a host such as "my-site.org" would be cut off at
    // the first '-'. The final class leaves out '?', '!', ':', ',', '.' and
    // ';' so that trailing sentence punctuation is not part of the match.
    // The scheme group is non-capturing because only the whole match is used.
    // CultureInvariant keeps case-insensitive matching independent of the
    // current culture. Built once and reused by every call.
    private static readonly Regex UrlPattern = new Regex(
        @"\b((?:https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|])",
        RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);

    /// <summary>
    /// Prints every URL found in <paramref name="str"/>, one per line, in
    /// order of appearance. Prints "-1" when the string contains no URL.
    /// </summary>
    /// <param name="str">The text to scan for URLs.</param>
    private static void ExtractURL(string str)
    {
        // Collect every non-overlapping match
        List<string> urlList = new List<string>();
        foreach (Match match in UrlPattern.Matches(str))
        {
            urlList.Add(match.Value);
        }

        // No URL at all: report -1 and stop
        if (urlList.Count == 0)
        {
            Console.WriteLine("-1");
            return;
        }

        // Otherwise print the URLs, one per line
        foreach (string url in urlList)
        {
            Console.WriteLine(url);
        }
    }

    /// <summary>Driver: runs the extractor on a sample sentence.</summary>
    private static void Main()
    {
        // Given String str
        string str = "Welcome to https://www.geeksforgeeks.org Computer Science Portal";

        // Function Call
        ExtractURL(str);
    }
}


Javascript




/**
 * Prints every URL found in `str`, one per line, in order of appearance.
 * Prints "-1" when the string contains no URL.
 *
 * A URL is an http, https, ftp or file scheme (matched case-insensitively)
 * followed by "://" and a run of URL characters.
 *
 * @param {string} str - The text to scan for URLs.
 */
function extractURL(str) {
    // The hyphen is listed first in each character class so that it is taken
    // literally; without it a host such as "my-site.org" would be cut off at
    // the first '-'. The final class leaves out '?', '!', ':', ',', '.' and
    // ';' so that trailing sentence punctuation is not part of the match.
    // 'g' finds every match, 'i' makes the scheme case-insensitive.
    const urlPattern =
        /\b((?:https?|ftp|file):\/\/[-a-zA-Z0-9+&@#\/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#\/%=~_|])/gi;

    // With the 'g' flag, match() returns all full matches, or null when
    // there are none. String() mirrors the coercion that exec() performed.
    const urlList = String(str).match(urlPattern) || [];

    // No URL at all: report -1 and stop
    if (urlList.length === 0) {
        console.log("-1");
        return;
    }

    // Otherwise print the URLs, one per line
    for (const url of urlList) {
        console.log(url);
    }
}
 
// Sample sentence that contains exactly one URL
const str = "Welcome to https://www.geeksforgeeks.org " +
    "Computer Science Portal";

// Print every URL found in the sample
extractURL(str);


Output

https://www.geeksforgeeks.org

Time Complexity: O(N)
Auxiliary Space: O(N), since the extracted URLs are stored in a list before being printed



Like Article
Suggest improvement
Share your thoughts in the comments

Similar Reads