from enchant.tokenize import get_tokenizer
from enchant.tokenize import EmailFilter

# Sample sentence containing an email address.
text = "The email is abc@gmail.com"

# Tokenize without any filter: the email address gets split into
# its component word tokens ('abc', 'gmail', 'com').
tokenizer = get_tokenizer("en_US")
print("Printing tokens without filtering:")
token_list = [token for token in tokenizer(text)]
print(token_list)

# Tokenize with EmailFilter: tokens belonging to an email address
# are skipped entirely.
tokenizer_filter = get_tokenizer("en_US", [EmailFilter])
print("\nPrinting tokens after filtering:")
token_list_filter = [token for token in tokenizer_filter(text)]
print(token_list_filter)
|
Output :
Printing tokens without filtering:
[('The', 0), ('email', 4), ('is', 10), ('abc', 13), ('gmail', 17), ('com', 23)]
Printing tokens after filtering:
[('The', 0), ('email', 4), ('is', 10)]
Example 2 : URLFilter
from enchant.tokenize import get_tokenizer
from enchant.tokenize import URLFilter

# Fix: the original snippet used `text` without defining it, relying on a
# variable left over from the previous example. Define the sample sentence
# this example's expected output actually corresponds to.
text = "This is an URL https://www.geeksforgeeks.org"

# Tokenize without any filter: the URL is broken up into individual
# word tokens ('https', 'www', 'geeksforgeeks', 'org').
tokenizer = get_tokenizer("en_US")
print("Printing tokens without filtering:")
token_list = []
for words in tokenizer(text):
    token_list.append(words)
print(token_list)

# Tokenize with URLFilter: tokens that are part of a URL are skipped.
tokenizer_filter = get_tokenizer("en_US", [URLFilter])
print("\nPrinting tokens after filtering:")
token_list_filter = []
for words in tokenizer_filter(text):
    token_list_filter.append(words)
print(token_list_filter)
|
Output :
Printing tokens without filtering:
[('This', 0), ('is', 5), ('an', 8), ('URL', 11), ('https', 16), ('www', 24), ('geeksforgeeks', 28), ('org', 42)]
Printing tokens after filtering:
[('This', 0), ('is', 5), ('an', 8), ('URL', 11)]
Example 3 : WikiWordFilter
A WikiWord is a word which consists of two or more words with initial capitals, run together.
from enchant.tokenize import get_tokenizer
from enchant.tokenize import WikiWordFilter

# A WikiWord is two or more capitalized words run together,
# e.g. "VersionFiveDotThree".
text = "VersionFiveDotThree is an example of WikiWord"

# Unfiltered tokenization keeps the WikiWord tokens in the stream.
tokenizer = get_tokenizer("en_US")
print("Printing tokens without filtering:")
token_list = list(tokenizer(text))
print(token_list)

# With WikiWordFilter, WikiWord tokens are dropped from the stream.
tokenizer_filter = get_tokenizer("en_US", [WikiWordFilter])
print("\nPrinting tokens after filtering:")
token_list_filter = list(tokenizer_filter(text))
print(token_list_filter)
|
Output :
Printing tokens without filtering:
[('VersionFiveDotThree', 0), ('is', 20), ('an', 23), ('example', 26), ('of', 34), ('WikiWord', 37)]
Printing tokens after filtering:
[('is', 20), ('an', 23), ('example', 26), ('of', 34)]
Share your thoughts in the comments
Please Login to comment...