Open In App

Split a String into columns using regex in pandas DataFrame

Given some mixed data containing multiple values as a string, let’s see how we can divide the strings using regex and make multiple columns in a Pandas DataFrame.

Method #1:
In this method we will use re.search(pattern, string, flags=0). Here pattern refers to the pattern that we want to search for. It takes in a string with the following values:






# import the regex library
import pandas as pd
import re
  
# Create a list with all the strings
movie_data = ["Name: The_Godfather Year: 1972 Rating: 9.2",
              "Name: Bird_Box Year: 2018 Rating: 6.8",
              "Name: Fight_Club Year: 1999 Rating: 8.8"]

# One pattern per output column. Each pattern anchors on the field label and
# captures only the value, so no follow-up search or strip() is needed.
# Raw strings keep "\w", "\s" and "\d" from being read as string escapes.
field_patterns = {
    "Name": r"Name:\s*(\w+)",
    "Year": r"Year:\s*(\d{4})",
    "Rating": r"Rating:\s*(\d+\.\d+)",
}

# Create a dictionary with the required columns
# Used later to convert to DataFrame
movies = {"Name": [], "Year": [], "Rating": []}

for item in movie_data:
    for column, pattern in field_patterns.items():
        match = re.search(pattern, item)
        # A missing field is stored as None instead of raising
        # AttributeError, so every column keeps the same length.
        movies[column].append(match.group(1) if match is not None else None)

# Creating DataFrame
df = pd.DataFrame(movies)
print(df)

Output:

Explanation:



 
Method #2:
To break up the string we will use Series.str.extract(pat, flags=0, expand=True) function. Here pat refers to the pattern that we want to search for.




import pandas as pd
  
# Raw strings to split; named `data` so the builtin `dict` is not shadowed
data = {'movie_data': ['The Godfather 1972 9.2',
                       'Bird Box 2018 6.8',
                       'Fight Club 1999 8.8']}

# Convert the dictionary to a dataframe
df = pd.DataFrame(data)

# Each pattern has a single capture group; expand=False makes str.extract
# return a Series, which is what a single-column assignment expects.
# Raw strings keep "\w", "\s" and "\d" from being read as string escapes.

# Extract name from the string (two words separated by whitespace)
df['Name'] = df['movie_data'].str.extract(r'(\w*\s\w*)', expand=False)

# Extract year from the string (first run of four digits)
df['Year'] = df['movie_data'].str.extract(r'(\d\d\d\d)', expand=False)

# Extract rating from the string (digit, literal dot, digit)
df['Rating'] = df['movie_data'].str.extract(r'(\d\.\d)', expand=False)
print(df)

Output:


Article Tags :