Bengali Natural Language Processing(BNLP) Toolkit

4 min read · Nov 29, 2019

BNLP is a natural language processing toolkit for the Bengali language. It helps you tokenize Bengali text, embed Bengali words, perform Bengali POS tagging and Bengali named entity recognition, and construct neural models for Bengali NLP purposes.

NB: This post is about https://github.com/sagorbrur/bnlp

Installation

pip install bnlp_toolkit

Pretrained Model

Tokenization

Basic Tokenizer

from bnlp import BasicTokenizer
basic_tokenizer = BasicTokenizer()
raw_text = "আমি বাংলায় গান গাই।"
tokens = basic_tokenizer.tokenize(raw_text)
print(tokens)
# output: ["আমি", "বাংলায়", "গান", "গাই", "।"]

NLTK Tokenization

from bnlp import NLTKTokenizer

bnltk = NLTKTokenizer()
text = "আমি ভাত খাই। সে বাজারে যায়। তিনি কি সত্যিই ভালো মানুষ?"
word_tokens = bnltk.word_tokenize(text)
sentence_tokens = bnltk.sentence_tokenize(text)
print(word_tokens)
print(sentence_tokens)
# output
# word_token: ["আমি", "ভাত", "খাই", "।", "সে", "বাজারে", "যায়", "।", "তিনি", "কি", "সত্যিই", "ভালো", "মানুষ", "?"]
# sentence_token: ["আমি ভাত খাই।", "সে বাজারে যায়।", "তিনি কি সত্যিই ভালো মানুষ?"]

Bengali SentencePiece Tokenizer

Tokenize using a trained SentencePiece model

from bnlp import SentencepieceTokenizer

bsp = SentencepieceTokenizer()
model_path = "./model/bn_spm.model"
input_text = "আমি ভাত খাই। সে বাজারে যায়।"
tokens = bsp.tokenize(model_path, input_text)
print(tokens)
text2id = bsp.text2id(model_path, input_text)
print(text2id)
id2text = bsp.id2text(model_path, text2id)
print(id2text)

Training SentencePiece with our custom raw data

from bnlp import SentencepieceTokenizer

bsp = SentencepieceTokenizer()
data = "raw_text.txt"
model_prefix = "test"
vocab_size = 5
bsp.train(data, model_prefix, vocab_size)

Word Embedding

Bengali Word2Vec

Generate a Vector Using a Pretrained Model

from bnlp import BengaliWord2Vec

bwv = BengaliWord2Vec()
model_path = "bengali_word2vec.model"
word = 'গ্রাম'
vector = bwv.generate_word_vector(model_path, word)
print(vector.shape)
print(vector)

Find Most Similar Word Using Pretrained Model

from bnlp import BengaliWord2Vec

bwv = BengaliWord2Vec()
model_path = "bengali_word2vec.model"
word = 'গ্রাম'
similar = bwv.most_similar(model_path, word, topn=10)
print(similar)

Train Bengali Word2Vec with your own data

Train Bengali word2vec with your custom raw data or tokenized sentences.

custom tokenized sentence format example:

sentences = [['আমি', 'ভাত', 'খাই', '।'], ['সে', 'বাজারে', 'যায়', '।']]

Check the gensim Word2Vec API for details of the training parameters.

from bnlp import BengaliWord2Vec
bwv = BengaliWord2Vec()
data_file = "raw_text.txt" # or you can pass custom sentence tokens as list of list
model_name = "test_model.model"
vector_name = "test_vector.vector"
bwv.train(data_file, model_name, vector_name, epochs=5)

Pre-train or resume Word2Vec training with the same corpus, a new corpus, or tokenized sentences

from bnlp import BengaliWord2Vec
bwv = BengaliWord2Vec()

trained_model_path = "mytrained_model.model"
data_file = "raw_text.txt"
model_name = "test_model.model"
vector_name = "test_vector.vector"
bwv.pretrain(trained_model_path, data_file, model_name, vector_name, epochs=5)

Bengali FastText

To use fastText, you need to install it manually: pip install fasttext==0.9.2

Generate Vector Using Pretrained Model

from bnlp.embedding.fasttext import BengaliFasttext

bft = BengaliFasttext()
word = "গ্রাম"
model_path = "bengali_fasttext_wiki.bin"
word_vector = bft.generate_word_vector(model_path, word)
print(word_vector.shape)
print(word_vector)

Train Bengali FastText Model

Check the fastText documentation for details of the training parameters.

from bnlp.embedding.fasttext import BengaliFasttext

bft = BengaliFasttext()
data = "raw_text.txt"
model_name = "saved_model.bin"
epoch = 50
bft.train(data, model_name, epoch)

Generate Vector File from Fasttext Binary Model

from bnlp.embedding.fasttext import BengaliFasttext

bft = BengaliFasttext()

model_path = "mymodel.bin"
out_vector_name = "myvector.txt"
bft.bin2vec(model_path, out_vector_name)

Bengali GloVe Word Vectors

from bnlp import BengaliGlove
glove_path = "bn_glove.39M.100d.txt"
word = "গ্রাম"
bng = BengaliGlove()
res = bng.closest_word(glove_path, word)
print(res)
vec = bng.word2vec(glove_path, word)
print(vec)

Bengali POS Tagging

Find POS Tags Using a Pretrained Model

from bnlp import POS
bn_pos = POS()
model_path = "model/bn_pos.pkl"
text = "আমি ভাত খাই।"
res = bn_pos.tag(model_path, text)
print(res)
# [('আমি', 'PPR'), ('ভাত', 'NC'), ('খাই', 'VM'), ('।', 'PU')]

Train POS Tag Model

from bnlp import POS
bn_pos = POS()
model_name = "pos_model.pkl"
tagged_sentences = [[('রপ্তানি', 'JJ'), ('দ্রব্য', 'NC'), ('-', 'PU'), ('তাজা', 'JJ'), ('ও', 'CCD'), ('শুকনা', 'JJ'), ('ফল', 'NC'), (',', 'PU'), ('আফিম', 'NC'), (',', 'PU'), ('পশুচর্ম', 'NC'), ('ও', 'CCD'), ('পশম', 'NC'), ('এবং', 'CCD'),('কার্পেট', 'NC'), ('৷', 'PU')], [('মাটি', 'NC'), ('থেকে', 'PP'), ('বড়জোর', 'JQ'), ('চার', 'JQ'), ('পাঁচ', 'JQ'), ('ফুট', 'CCL'), ('উঁচু', 'JJ'), ('হবে', 'VM'), ('৷', 'PU')]]

bn_pos.train(model_name, tagged_sentences)

Bengali Named Entity Recognition

Find NER Using Pretrained Model

from bnlp import NER
bn_ner = NER()
model_path = "model/bn_ner.pkl"
text = "সে ঢাকায় থাকে।"
result = bn_ner.tag(model_path, text)
print(result)
# [('সে', 'O'), ('ঢাকায়', 'S-LOC'), ('থাকে', 'O')]

Train NER Model

from bnlp import NER
bn_ner = NER()
model_name = "ner_model.pkl"
tagged_sentences = [[('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')], [('ত্রাণ', 'O'),('ও', 'O'),('সমাজকল্যাণ', 'O'),('সম্পাদক', 'S-PER'),('সুজিত', 'B-PER'),('রায়', 'I-PER'),('নন্দী', 'E-PER'),('প্রমুখ', 'O'),('সংবাদ', 'O'),('সম্মেলনে', 'O'),('উপস্থিত', 'O'),('ছিলেন', 'O')]]

bn_ner.train(model_name, tagged_sentences)

Bengali Corpus Class

Stopwords and Punctuations

from bnlp.corpus import stopwords, punctuations, letters, digits

print(stopwords)
print(punctuations)
print(letters)
print(digits)

Remove stopwords from Text

from bnlp.corpus import stopwords
from bnlp.corpus.util import remove_stopwords

raw_text = 'আমি ভাত খাই।' 
result = remove_stopwords(raw_text, stopwords)
print(result)
# ['ভাত', 'খাই', '।']