Open In App

HTML Entity Parser

Last Updated : 01 Apr, 2024
Improve
Improve
Like Article
Like
Save
Share
Report

Given a string str which has various HTML Entities in it, the task is to replace these entities with their corresponding special character.

An HTML entity parser takes HTML text as input and replaces every entity of a special character with the character itself. For example, for the Quotation Mark the entity is &quot; and the symbol character is ".

The HTML entities and their corresponding special characters are shown in the table below:

Name / Description | HTML Entity | Special Character
Space | &nbsp; | (space)
Ampersand | &amp; | &
Greater than | &gt; | >
Less than | &lt; | <
Single Quotation Mark | &apos; | '
Double Quotation Mark | &quot; | "
Registered Trademark | &reg; | ®
Copyright mark | &copy; | ©
Forward Slash | &frasl; | /

Examples:

Input: str = “17 &gt; 25 and 25 &lt; 17” 
Output: 17 > 25 and 25 < 17 
Explanation: In the above example &gt; is replaced by corresponding special character > and &lt; is replaced by < 

Input: str = “&copy; is symbol of copyright” 
Output: © is symbol of copyright 
Explanation: In the above example &copy; is replaced by corresponding special character ©

Method 1 – using unordered_map: Below are the steps:

  1. Store the HTML Entity with their character in a Map.
  2. Traverse the given string and if any character ‘&’ is encountered then find which HTML Entity is present after this ampersand.
  3. Append the character that corresponds to that entity to the output string.
  4. Print the output string as the result.

Below is the implementation of the above approach: 

Java
import java.util.HashMap;
import java.util.Map;
// Java program for the above approach
/**
 * Replaces a fixed set of HTML entities in a string with the characters they
 * stand for, using a map from entity text to replacement.
 */
public class HtmlEntityParser {
  // Entity text (leading '&' through trailing ';') mapped to its character.
  public static Map<String, String> map = new HashMap<>();

  static {
    // Associating html entity with
    // special character
    map.put("&quot;", "\"");
    map.put("&apos;", "'");
    map.put("&amp;", "&");
    map.put("&gt;", ">");
    map.put("&lt;", "<");
    map.put("&frasl;", "/");
    map.put("&nbsp;", " ");
    map.put("&reg;", "®");
    map.put("&copy;", "©");
  }

  /**
   * Returns a copy of {@code input} in which every entity present in
   * {@link #map} is replaced by its mapped character.
   *
   * Every ampersand is examined on its own. If it does not start a known
   * entity it is copied through unchanged and scanning resumes at the next
   * character, so a stray ampersand or an unknown entity no longer prevents
   * later entities from being decoded.
   *
   * @param input text that may contain HTML entities
   * @return the text with all known entities replaced
   */
  public static String parseInputString(String input) {
    // No entity is longer than the longest key, so the search for the
    // closing ';' never has to look further than that.
    int longestEntity = 0;
    for (String entity : map.keySet()) {
      longestEntity = Math.max(longestEntity, entity.length());
    }

    StringBuilder output = new StringBuilder(input.length());
    int i = 0;
    while (i < input.length()) {
      int consumed = 0;
      if (input.charAt(i) == '&') {
        consumed = appendEntityAt(input, i, longestEntity, output);
      }
      if (consumed == 0) {
        // Not the start of a known entity: copy the character as-is.
        output.append(input.charAt(i));
        consumed = 1;
      }
      i += consumed;
    }
    return output.toString();
  }

  // Appends the replacement for the entity starting at 'start', if there is
  // one, and returns how many input characters it covered (0 when none).
  private static int appendEntityAt(String input, int start, int longestEntity,
                                    StringBuilder output) {
    int limit = Math.min(input.length(), start + longestEntity);
    for (int end = start + 1; end < limit; end++) {
      if (input.charAt(end) != ';') {
        continue;
      }
      String replacement = map.get(input.substring(start, end + 1));
      if (replacement != null) {
        output.append(replacement);
        return end + 1 - start;
      }
    }
    return 0;
  }

  public static void main(String[] args) {
    // Initialize the parse string
    String input = "17 &gt; 25 and 25 &lt; 17";
    // Function call
    System.out.println(parseInputString(input));
  }
}
C#
// C# program for the above approach

using System;
using System.Collections.Generic;

/// <summary>
/// Replaces a fixed set of HTML entities in a string with the characters
/// they stand for, using a dictionary from entity text to replacement.
/// </summary>
public class HtmlEntityParser {
    // Entity text (leading '&' through trailing ';') mapped to its character.
    public static Dictionary<string, string> map = new Dictionary<string, string>()
    {
        // Associating html entity with
        // special character
        { "&quot;", "\"" },
        { "&apos;", "'" },
        { "&amp;", "&" },
        { "&gt;", ">" },
        { "&lt;", "<" },
        { "&frasl;", "/" },
        { "&nbsp;", " " },
        { "&reg;", "®" },
        { "&copy;", "©" }
    };

    /// <summary>
    /// Returns a copy of <paramref name="input"/> in which every entity
    /// present in <see cref="map"/> is replaced by its mapped character.
    /// Every ampersand is examined on its own; if it does not start a known
    /// entity it is copied through unchanged and scanning resumes at the next
    /// character, so a stray ampersand or an unknown entity no longer
    /// prevents later entities from being decoded.
    /// </summary>
    /// <param name="input">Text that may contain HTML entities.</param>
    /// <returns>The text with all known entities replaced.</returns>
    public static string ParseInputString(string input)
    {
        // No entity is longer than the longest key, so the search for the
        // closing ';' never has to look further than that.
        int longestEntity = 0;
        foreach (string entity in map.Keys)
        {
            longestEntity = Math.Max(longestEntity, entity.Length);
        }

        var output = new System.Text.StringBuilder(input.Length);
        int i = 0;
        while (i < input.Length)
        {
            int consumed = 0;
            if (input[i] == '&')
            {
                consumed = AppendEntityAt(input, i, longestEntity, output);
            }
            if (consumed == 0)
            {
                // Not the start of a known entity: copy the character as-is.
                output.Append(input[i]);
                consumed = 1;
            }
            i += consumed;
        }
        return output.ToString();
    }

    // Appends the replacement for the entity starting at 'start', if there
    // is one, and returns how many input characters it covered (0 when none).
    private static int AppendEntityAt(string input, int start, int longestEntity,
                                      System.Text.StringBuilder output)
    {
        int limit = Math.Min(input.Length, start + longestEntity);
        for (int end = start + 1; end < limit; end++)
        {
            if (input[end] != ';')
            {
                continue;
            }
            string replacement;
            if (map.TryGetValue(input.Substring(start, end - start + 1), out replacement))
            {
                output.Append(replacement);
                return end - start + 1;
            }
        }
        return 0;
    }

    public static void Main(string[] args)
    {
        // Initialize the parse string
        string input = "17 &gt; 25 and 25 &lt; 17";
        // Function call
        Console.WriteLine(ParseInputString(input));
    }
}
// Contributed by adityasharmadev01
Javascript
// JavaScript program for the above approach
/**
 * Replaces a fixed set of HTML entities in a string with the characters they
 * stand for. Call initializeMap() before parseInputString().
 */
class GfG {
  constructor() {
    // Entity text (leading '&' through trailing ';') -> replacement character.
    this.m = {};
  }

  // Associating html entity with special character
  initializeMap() {
    this.m["&quot;"] = "\"";
    this.m["&apos;"] = "'";
    this.m["&amp;"] = "&";
    this.m["&gt;"] = ">";
    this.m["&lt;"] = "<";
    this.m["&frasl;"] = "/";
    this.m["&nbsp;"] = " ";
    this.m["&reg;"] = "®";
    this.m["&copy;"] = "©";
  }

  /**
   * Returns a copy of `input` in which every entity present in `this.m` is
   * replaced by its mapped character.
   *
   * Every ampersand is examined on its own. If it does not start a known
   * entity it is copied through unchanged and scanning resumes at the next
   * character, so a stray ampersand or an unknown entity no longer prevents
   * later entities from being decoded.
   *
   * @param {string} input text that may contain HTML entities
   * @returns {string} the text with all known entities replaced
   */
  parseInputString(input) {
    // No entity is longer than the longest key, so the search for the
    // closing ';' never has to look further than that.
    let longest = 0;
    for (const entity of Object.keys(this.m)) {
      longest = Math.max(longest, entity.length);
    }

    let output = "";
    let i = 0;
    while (i < input.length) {
      let consumed = 0;

      if (input[i] === '&') {
        const limit = Math.min(input.length, i + longest);
        for (let end = i + 1; end < limit; end++) {
          if (input[end] !== ';') {
            continue;
          }
          const candidate = input.slice(i, end + 1);
          // Own-property check so inherited object members never match.
          if (Object.prototype.hasOwnProperty.call(this.m, candidate)) {
            output += this.m[candidate];
            consumed = end + 1 - i;
            break;
          }
        }
      }

      if (consumed === 0) {
        // Not the start of a known entity: copy the character as-is.
        output += input[i];
        consumed = 1;
      }
      i += consumed;
    }

    // Return the parsed string
    return output;
  }
}

// Driver: build the parser, load its entity table, then decode a sample
const parser = new GfG();
parser.initializeMap();

// Sample text containing two entities
const sample = "17 &gt; 25 and 25 &lt; 17";

// Print the decoded text
console.log(parser.parseInputString(sample));
C++
// C++ program for the above approach
#include <iostream>
#include <unordered_map>
using namespace std;

// Replaces a fixed set of HTML entities in a string with the characters they
// stand for. Call initializeMap() before parseInputString().
class GfG {
public:
    // Entity text (leading '&' through trailing ';') -> replacement.
    std::unordered_map<std::string, std::string> m;

public:
    // Associating html entity with
    // special character
    void initializeMap()
    {
        m["&quot;"] = "\"";
        m["&apos;"] = "'";
        m["&amp;"] = "&";
        m["&gt;"] = ">";
        m["&lt;"] = "<";
        m["&frasl;"] = "/";
        m["&nbsp;"] = " ";
        m["&reg;"] = "®";
        m["&copy;"] = "©";
    }

public:
    // Returns a copy of `input` in which every entity present in `m` is
    // replaced by its mapped character.
    //
    // Every ampersand is examined on its own. If it does not start a known
    // entity it is copied through unchanged and scanning resumes at the next
    // character, so a stray ampersand or an unknown entity no longer
    // prevents later entities from being decoded.
    std::string parseInputString(std::string input)
    {
        // No entity is longer than the longest key, so the search for the
        // closing ';' never has to look further than that.
        std::size_t longest = 0;
        for (const auto& entry : m) {
            if (entry.first.size() > longest) {
                longest = entry.first.size();
            }
        }

        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            std::size_t consumed = 0;

            if (input[i] == '&') {
                std::size_t limit = i + longest;
                if (limit > input.size()) {
                    limit = input.size();
                }

                for (std::size_t end = i + 1; end < limit; end++) {
                    // Only a semicolon can close an entity
                    if (input[end] != ';') {
                        continue;
                    }
                    auto found = m.find(input.substr(i, end - i + 1));
                    if (found != m.end()) {
                        output += found->second;
                        consumed = end - i + 1;
                        break;
                    }
                }
            }

            if (consumed == 0) {
                // Not the start of a known entity: copy the character as-is.
                output += input[i];
                consumed = 1;
            }
            i += consumed;
        }

        // Return the parsed string
        return output;
    }
};

// Driver Code
int main()
{
    // Given String
    string input = "17 &gt; 25 and 25 &lt; 17";
    GfG g;

    // Initialised parsed string
    g.initializeMap();

    // Function Call
    cout << g.parseInputString(input);
    return 0;
}
Python3
# Python program for the above approach
class GfG:
    """Replaces a fixed set of HTML entities with the characters they stand for.

    Call initializeMap() before parseInputString().
    """

    def __init__(self):
        # Entity text (leading '&' through trailing ';') -> replacement.
        self.m = {}

    # Associating html entity with special character
    def initializeMap(self):
        self.m["&quot;"] = "\""
        self.m["&apos;"] = "'"
        self.m["&amp;"] = "&"
        self.m["&gt;"] = ">"
        self.m["&lt;"] = "<"
        self.m["&frasl;"] = "/"
        self.m["&nbsp;"] = " "
        self.m["&reg;"] = "®"
        self.m["&copy;"] = "©"

    def parseInputString(self, input):
        """Return `input` with every entity found in `self.m` replaced.

        Every ampersand is examined on its own. If it does not start a known
        entity it is copied through unchanged and scanning resumes at the
        next character, so a stray ampersand or an unknown entity no longer
        prevents later entities from being decoded.

        :param input: text that may contain HTML entities
        :return: the text with all known entities replaced
        """
        # No entity is longer than the longest key, so the search for the
        # closing ';' never has to look further than that.
        longest = max((len(entity) for entity in self.m), default=0)

        pieces = []
        n = len(input)
        i = 0
        while i < n:
            consumed = 0

            if input[i] == '&':
                limit = min(n, i + longest)
                end = input.find(';', i + 1, limit)
                while end != -1:
                    candidate = input[i:end + 1]
                    if candidate in self.m:
                        pieces.append(self.m[candidate])
                        consumed = end + 1 - i
                        break
                    end = input.find(';', end + 1, limit)

            if consumed == 0:
                # Not the start of a known entity: copy the character as-is.
                pieces.append(input[i])
                consumed = 1
            i += consumed

        # Return the parsed string
        return "".join(pieces)

# Driver Code
if __name__ == '__main__':
    # Build the parser and load its entity table
    parser = GfG()
    parser.initializeMap()

    # Decode the sample text and print the result
    sample = "17 &gt; 25 and 25 &lt; 17"
    print(parser.parseInputString(sample))
    
# Contributed by adityasha4x71

Output
17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N) 

Method 2 – using Pattern Matching: Below are the steps:

  1. Traverse the given string str.
  2. While traversing, if any character ‘&’ is encountered then find which HTML Entity is present after this ampersand.
  3. Append to the output string the character that the table above lists for the matched entity.
  4. Print the output string as the result after traversing the above string.

Below is the implementation of the above approach: 

Java
/**
 * Decodes a fixed set of HTML entities by matching each candidate against
 * hard-coded patterns.
 */
public class GfG {
    // Length of "&frasl;", the longest entity decode() recognises.
    private static final int MAX_ENTITY_LENGTH = 7;

    /**
     * Returns a copy of {@code input} in which every recognised entity is
     * replaced by its character.
     *
     * Every ampersand is examined on its own. If it does not start a
     * recognised entity it is copied through unchanged and scanning resumes
     * at the next character, so a stray ampersand or an unknown entity no
     * longer prevents later entities from being decoded.
     *
     * @param input text that may contain HTML entities
     * @return the text with all recognised entities replaced
     */
    public String parseInputString(String input) {

        // To store parsed string
        StringBuilder output = new StringBuilder(input.length());

        int i = 0;
        while (i < input.length()) {
            if (input.charAt(i) == '&') {
                // Look for the closing ';' no further than the longest entity.
                int limit = Math.min(input.length(), i + MAX_ENTITY_LENGTH);
                int end = i + 1;
                while (end < limit && input.charAt(end) != ';') {
                    end++;
                }

                String replacement = null;
                if (end < limit) {
                    replacement = decode(input.substring(i, end + 1));
                }
                if (replacement != null) {
                    output.append(replacement);
                    i = end + 1;
                    continue;
                }
            }

            // Ordinary character, or an '&' that starts no recognised entity.
            output.append(input.charAt(i));
            i++;
        }

        // Return the parsed string
        return output.toString();
    }

    // Maps one entity to its character; returns null for anything else.
    private static String decode(String entity) {
        switch (entity) {
            case "&quot;":
                return "\"";
            case "&apos;":
                return "'";
            case "&amp;":
                return "&";
            case "&gt;":
                return ">";
            case "&lt;":
                return "<";
            case "&frasl;":
                return "/";
            case "&nbsp;":
                return " ";
            case "&reg;":
                return "®";
            case "&copy;":
                return "©";
            default:
                return null;
        }
    }

    public static void main(String[] args) {
        // Given String
        String input = "17 &gt; 25 and 25 &lt; 17";
        GfG g = new GfG();

        // Function Call
        System.out.println(g.parseInputString(input));
    }
}
C#
using System;

// Decodes a fixed set of HTML entities by matching each candidate against
// hard-coded patterns.
class GfG
{
    // Length of "&frasl;", the longest entity Decode() recognises.
    private const int MaxEntityLength = 7;

    /// <summary>
    /// Returns a copy of <paramref name="input"/> in which every recognised
    /// entity is replaced by its character. Every ampersand is examined on
    /// its own; if it does not start a recognised entity it is copied
    /// through unchanged and scanning resumes at the next character, so a
    /// stray ampersand or an unknown entity no longer prevents later
    /// entities from being decoded.
    /// </summary>
    /// <param name="input">Text that may contain HTML entities.</param>
    /// <returns>The text with all recognised entities replaced.</returns>
    public string ParseInputString(string input)
    {
        // To store the parsed string
        var output = new System.Text.StringBuilder(input.Length);

        int i = 0;
        while (i < input.Length)
        {
            if (input[i] == '&')
            {
                // Look for the closing ';' no further than the longest entity.
                int limit = Math.Min(input.Length, i + MaxEntityLength);
                int end = i + 1;
                while (end < limit && input[end] != ';')
                {
                    end++;
                }

                string replacement = null;
                if (end < limit)
                {
                    replacement = Decode(input.Substring(i, end - i + 1));
                }
                if (replacement != null)
                {
                    output.Append(replacement);
                    i = end + 1;
                    continue;
                }
            }

            // Ordinary character, or an '&' that starts no recognised entity.
            output.Append(input[i]);
            i++;
        }

        // Return the parsed string
        return output.ToString();
    }

    // Maps one entity to its character; returns null for anything else.
    private static string Decode(string entity)
    {
        switch (entity)
        {
            case "&quot;":
                return "\"";
            case "&apos;":
                return "'";
            case "&amp;":
                return "&";
            case "&gt;":
                return ">";
            case "&lt;":
                return "<";
            case "&frasl;":
                return "/";
            case "&nbsp;":
                return " ";
            case "&reg;":
                return "®";
            case "&copy;":
                return "©";
            default:
                return null;
        }
    }
}

// Console entry point that runs the parser on a sample string.
class Program
{
    static void Main(string[] args)
    {
        // Parser instance used for the demonstration
        GfG parser = new GfG();

        // Sample text containing two entities
        string sample = "17 &gt; 25 and 25 &lt; 17";

        // Decode and print
        string decoded = parser.ParseInputString(sample);
        Console.WriteLine(decoded);
    }
}
Javascript
/**
 * Returns a copy of `input` in which every recognised HTML entity is replaced
 * by its character.
 *
 * Every ampersand is examined on its own. If it does not start a recognised
 * entity it is copied through unchanged and scanning resumes at the next
 * character, so a stray ampersand or an unknown entity no longer prevents
 * later entities from being decoded.
 *
 * @param {string} input text that may contain HTML entities
 * @returns {string} the text with all recognised entities replaced
 */
function parseInputString(input) {
  // Length of "&frasl;", the longest entity decodeEntity() recognises.
  const MAX_ENTITY_LENGTH = 7;

  // Maps one entity to its character; returns null for anything else.
  const decodeEntity = (entity) => {
    switch (entity) {
      case '&quot;':
        return '"';
      case '&apos;':
        return "'";
      case '&amp;':
        return "&";
      case '&gt;':
        return ">";
      case '&lt;':
        return "<";
      case '&frasl;':
        return "/";
      case '&nbsp;':
        return " ";
      case '&reg;':
        return "®";
      case '&copy;':
        return "©";
      default:
        return null;
    }
  };

  // To store parsed string
  let output = '';

  let i = 0;
  while (i < input.length) {
    if (input.charAt(i) === '&') {
      // Look for the closing ';' no further than the longest entity.
      const limit = Math.min(input.length, i + MAX_ENTITY_LENGTH);
      let end = i + 1;
      while (end < limit && input.charAt(end) !== ';') {
        end++;
      }

      const replacement =
        end < limit ? decodeEntity(input.slice(i, end + 1)) : null;
      if (replacement !== null) {
        output += replacement;
        i = end + 1;
        continue;
      }
    }

    // Ordinary character, or an '&' that starts no recognised entity.
    output += input.charAt(i);
    i++;
  }

  // Return the parsed string
  return output;
}

// Sample text containing two entities
const sample = "17 &gt; 25 and 25 &lt; 17";

// Decode it and print the result
const decoded = parseInputString(sample);
console.log(decoded);
C++
#include <iostream>
using namespace std;

// Decodes a fixed set of HTML entities by matching each candidate against
// hard-coded patterns.
class GfG {
public:
    // Returns a copy of `input` in which every recognised entity is replaced
    // by its character.
    //
    // Every ampersand is examined on its own. If it does not start a
    // recognised entity it is copied through unchanged and scanning resumes
    // at the next character, so a stray ampersand or an unknown entity no
    // longer prevents later entities from being decoded.
    std::string parseInputString(std::string input) {
        // Length of "&frasl;", the longest entity decode() recognises.
        const std::size_t maxEntityLength = 7;

        // To store parsed string
        std::string output;
        output.reserve(input.size());

        std::size_t i = 0;
        while (i < input.size()) {
            if (input[i] == '&') {
                // Look for the closing ';' no further than the longest entity.
                std::size_t limit = i + maxEntityLength;
                if (limit > input.size()) {
                    limit = input.size();
                }
                std::size_t end = i + 1;
                while (end < limit && input[end] != ';') {
                    end++;
                }

                const char* replacement = nullptr;
                if (end < limit) {
                    replacement = decode(input.substr(i, end - i + 1));
                }
                if (replacement != nullptr) {
                    output += replacement;
                    i = end + 1;
                    continue;
                }
            }

            // Ordinary character, or an '&' that starts no recognised entity.
            output += input[i];
            i++;
        }

        // Return the parsed string
        return output;
    }

private:
    // Maps one entity to its character; returns nullptr for anything else.
    static const char* decode(const std::string& entity) {
        if (entity == "&quot;") {
            return "\"";
        }
        if (entity == "&apos;") {
            return "'";
        }
        if (entity == "&amp;") {
            return "&";
        }
        if (entity == "&gt;") {
            return ">";
        }
        if (entity == "&lt;") {
            return "<";
        }
        if (entity == "&frasl;") {
            return "/";
        }
        if (entity == "&nbsp;") {
            return " ";
        }
        if (entity == "&reg;") {
            return "®";
        }
        if (entity == "&copy;") {
            return "©";
        }
        return nullptr;
    }
};

// Driver Code
int main() {
    // Given String
    string input = "17 &gt; 25 and 25 &lt; 17";
    GfG g;

    // Function Call
    cout << g.parseInputString(input);
    return 0;
}
Python3
class GfG:
    """Replaces a fixed set of HTML entities with the characters they stand for."""

    def __init__(self):
        # Dictionary keyed by special character, holding the HTML entity that
        # represents it (character -> entity).
        self.html_entities = {
            '"': "&quot;",
            "'": "&apos;",
            "&": "&amp;",
            ">": "&gt;",
            "<": "&lt;",
            "/": "&frasl;",
            " ": "&nbsp;",
            "®": "&reg;",
            "©": "&copy;"
        }

    def parse_input_string(self, input_str):
        """
        Parses the input string by replacing HTML entities with their corresponding characters.

        Every ampersand is examined on its own. If it does not start a known
        entity it is copied through unchanged and scanning resumes at the
        next character, so a stray ampersand or an unknown entity no longer
        prevents later entities from being decoded.

        :param input_str: Input string containing HTML entities
        :return: Parsed string with replaced characters
        """
        # Invert the table once per call (entity -> character) so each
        # candidate is a dictionary lookup instead of a scan over the values.
        # setdefault keeps the first character listed for an entity.
        entity_to_char = {}
        for char, entity in self.html_entities.items():
            entity_to_char.setdefault(entity, char)

        # No entity is longer than the longest one in the table, so the
        # search for the closing ';' never has to look further than that.
        longest = max((len(entity) for entity in entity_to_char), default=0)

        pieces = []
        n = len(input_str)
        i = 0

        while i < n:
            consumed = 0

            if input_str[i] == '&':
                limit = min(n, i + longest)
                end = input_str.find(';', i + 1, limit)
                while end != -1:
                    candidate = input_str[i:end + 1]
                    if candidate in entity_to_char:
                        pieces.append(entity_to_char[candidate])
                        consumed = end + 1 - i
                        break
                    end = input_str.find(';', end + 1, limit)

            if consumed == 0:
                # Not the start of a known entity: copy the character as-is.
                pieces.append(input_str[i])
                consumed = 1
            i += consumed

        # Return the parsed string
        return "".join(pieces)


# Driver Code
if __name__ == "__main__":
    # Parser instance used for the demonstration
    parser = GfG()

    # Decode the sample text and print the result
    sample = "17 &gt; 25 and 25 &lt; 17"
    decoded = parser.parse_input_string(sample)
    print(decoded)

Output
17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N) 

Method 3 – using Regular Expression: Below are the steps:

  1. Store every expression with its mapped value in a Map M.
  2. For each key in the map, create a regular expression using:

regex e(key);

  3. Now replace every match of the regular expression formed above with its mapped value in the Map M as:

regex_replace(str, e, value); where str is the input string, e is the expression formed in the above step, and value is the value mapped to expression e in the Map

  4. Repeat the above steps until every expression has been replaced.

Below is the implementation of the above approach: 

Java
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Decodes a fixed set of HTML entities with a single regular-expression pass.
 */
public class MainClass {

    // Given Expression with mapped value
    static Map<String, String> m = new HashMap<String, String>();

    static {
        m.put("&quot;", "\"");
        m.put("&apos;", "'");
        m.put("&amp;", "&");
        m.put("&gt;", ">");
        m.put("&lt;", "<");
        m.put("&frasl;", "/");
    }

    // Matches anything shaped like a named entity: '&', letters, ';'.
    private static final Pattern ENTITY_PATTERN = Pattern.compile("&[A-Za-z]+;");

    /**
     * Returns a copy of {@code input} in which every entity present in
     * {@code m} is replaced by its mapped value.
     *
     * The text is scanned once, left to right, so replacement output is
     * never scanned again: {@code "&amp;lt;"} decodes to {@code "&lt;"}.
     * Replacing one entity after another over the whole string could decode
     * it a second time, and whether it did depended on the map's iteration
     * order. Entity-shaped text that is not in the map is left unchanged.
     *
     * @param input text that may contain HTML entities
     * @return the text with all known entities replaced
     */
    static String parseInputString(String input) {
        Matcher matcher = ENTITY_PATTERN.matcher(input);
        StringBuffer output = new StringBuffer(input.length());

        while (matcher.find()) {
            String entity = matcher.group();
            String replacement = m.get(entity);
            if (replacement == null) {
                replacement = entity;
            }
            // quoteReplacement keeps '$' and '\' in the value literal.
            matcher.appendReplacement(output, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(output);

        // Return the parsed string
        return output.toString();
    }

    // Driver Code
    public static void main(String[] args) {
        // Given String
        String input = "17 &gt; 25 and 25 &lt; 17";

        // Function Call
        System.out.println(parseInputString(input));
    }
}
Python
import re
import sys

# Dictionary mapping HTML entities to their respective characters
m = {
    "&quot;": "\"",
    "&apos;": "'",
    "&amp;": "&",
    "&gt;": ">",
    "&lt;": "<",
    "&frasl;": "/"
}

# Function that converts the given HTML Entity to its parsed String
def parse_input_string(input_str):
    """Return input_str with every entity listed in `m` replaced by its value.

    All entities are replaced in one left-to-right pass, so replacement
    output is never scanned again: "&amp;gt;" decodes to "&gt;". Replacing
    one entity after another over the whole string decoded it twice, to ">".

    :param input_str: text that may contain HTML entities
    :return: the text with all known entities replaced
    """
    if not m:
        return input_str

    # One alternation over every key; longer keys first so a key that is a
    # prefix of another can never shadow it.
    pattern = "|".join(re.escape(key) for key in sorted(m, key=len, reverse=True))

    # A callable replacement inserts the mapped value literally, with no
    # backslash/group processing of the value.
    return re.sub(pattern, lambda match: m[match.group(0)], input_str)

if __name__ == "__main__":
    # Words given on the command line, joined by single spaces, form the
    # input; with no arguments the built-in sample is used instead.
    cli_words = sys.argv[1:]
    if cli_words:
        text = ' '.join(cli_words)
    else:
        text = "17 &gt; 25 and 25 &lt; 17"

    # Decode and print
    print(parse_input_string(text))
C#
using System;
using System.Text.RegularExpressions;
using System.Collections.Generic;

/// <summary>
/// Decodes a fixed set of HTML entities with a single regular-expression pass.
/// </summary>
public class MainClass
{
    // Given Expression with mapped value
    static Dictionary<string, string> m = new Dictionary<string, string>
    {
        { "&quot;", "\"" },
        { "&apos;", "'" },
        { "&amp;", "&" },
        { "&gt;", ">" },
        { "&lt;", "<" },
        { "&frasl;", "/" }
    };

    /// <summary>
    /// Returns a copy of <paramref name="input"/> in which every entity
    /// present in the map is replaced by its mapped value. The text is
    /// scanned once, left to right, so replacement output is never scanned
    /// again: "&amp;amp;gt;" decodes to "&amp;gt;". Replacing one entity
    /// after another over the whole string decoded it a second time.
    /// Entity-shaped text that is not in the map is left unchanged.
    /// </summary>
    /// <param name="input">Text that may contain HTML entities.</param>
    /// <returns>The text with all known entities replaced.</returns>
    static string ParseInputString(string input)
    {
        // Matches anything shaped like a named entity: '&', letters, ';'.
        // The evaluator returns the mapped value literally, so '$' in a
        // value is never treated as a substitution.
        return Regex.Replace(input, "&[A-Za-z]+;", match =>
        {
            string replacement;
            return m.TryGetValue(match.Value, out replacement) ? replacement : match.Value;
        });
    }

    // Driver Code
    public static void Main(string[] args)
    {
        // Given String
        string input = "17 &gt; 25 and 25 &lt; 17";

        // Function Call
        Console.WriteLine(ParseInputString(input));
    }
}
JavaScript
// Given Expression with mapped value
const m = {
    "&quot;": "\"",  // HTML entity for double quote
    "&apos;": "'",   // HTML entity for single quote
    "&amp;": "&",    // HTML entity for ampersand
    "&gt;": ">",     // HTML entity for greater than
    "&lt;": "<",     // HTML entity for less than
    "&frasl;": "/"   // HTML entity for slash (the key was "/", so it never matched)
};

/**
 * Returns a copy of `input` in which every entity listed in `m` is replaced
 * by its mapped value.
 *
 * The text is scanned once, left to right, so replacement output is never
 * scanned again: "&amp;gt;" decodes to "&gt;". Replacing one entity after
 * another over the whole string decoded it twice, to ">". Entity-shaped text
 * that is not in `m` is left unchanged.
 *
 * @param {string} input text that may contain HTML entities
 * @returns {string} the text with all known entities replaced
 */
function parseInputString(input) {
    // Matches anything shaped like a named entity: '&', letters, ';'.
    // A function replacement inserts the value literally, so '$' sequences
    // in a value are never interpreted.
    return input.replace(/&[A-Za-z]+;/g, (entity) =>
        Object.prototype.hasOwnProperty.call(m, entity) ? m[entity] : entity
    );
}

// Driver Code
const sample = "17 &gt; 25 and 25 &lt; 17";
// Decode the sample text and print the result
const decoded = parseInputString(sample);
console.log(decoded);
C++
#include <iostream>
#include <regex>
#include <unordered_map>
using namespace std;

// Given Expression with mapped value
const std::unordered_map<std::string, std::string> m = {
    { "&quot;", "\"" },  // HTML entity for double quote
    { "&apos;", "'" },   // HTML entity for single quote
    { "&amp;", "&" },    // HTML entity for ampersand
    { "&gt;", ">" },     // HTML entity for greater than
    { "&lt;", "<" },     // HTML entity for less than
    { "&frasl;", "/" }   // HTML entity for slash (the key was "/", so it never matched)
};

// Returns a copy of `input` in which every entity listed in `m` is replaced
// by its mapped value.
//
// The text is scanned once, left to right, so replacement output is never
// scanned again: "&amp;gt;" decodes to "&gt;". Replacing one entity after
// another over the whole string could decode it a second time, and whether
// it did depended on the unordered_map's iteration order. Entity-shaped text
// that is not in `m` is left unchanged.
std::string parseInputString(std::string input)
{
    // Matches anything shaped like a named entity: '&', letters, ';'.
    // Compiled once instead of once per entity per call.
    static const std::regex entityPattern("&[A-Za-z]+;");

    std::string output;
    output.reserve(input.size());

    // Start of the text not yet copied to the output.
    auto tail = input.cbegin();

    const std::sregex_iterator last;
    for (std::sregex_iterator it(input.cbegin(), input.cend(), entityPattern);
         it != last; ++it) {
        const std::smatch& match = *it;

        // Copy the plain text between the previous match and this one.
        output.append(tail, match[0].first);

        const std::string entity = match.str();
        auto found = m.find(entity);
        output += (found != m.end()) ? found->second : entity;

        tail = match[0].second;
    }

    // Copy whatever follows the last match.
    output.append(tail, input.cend());

    // Return the parsed string
    return output;
}

// Driver Code
int main()
{
    // Sample text containing two entities
    const std::string sample = "17 &gt; 25 and 25 &lt; 17";

    // Decode it and print the result
    const std::string decoded = parseInputString(sample);
    std::cout << decoded;

    return 0;
}

Output
17 > 25 and 25 < 17

Time Complexity: O(N) 
Auxiliary Space: O(N)



Like Article
Suggest improvement
Previous
Next
Share your thoughts in the comments

Similar Reads