Named Entity Recognition from a Wikipedia Article Using spaCy

Image for post
Image for post
Photo by Julian Rivera on Unsplash

Steps

import wikipedia
import requests
import spacy
from collections import Counter
import matplotlib.pyplot as plt
import spacy
nlp = spacy.load('en_core_web_lg')

Search for a specific page

# Query Wikipedia's search for "Krishna"; the bare `result` expression echoes
# the returned list of candidate page titles in the notebook.
result = wikipedia.search("Krishna")
result
['Krishna',
'Krishna Krishna',
'Krishna Janmashtami',
'Krishna (Telugu actor)',
'Krishna Vamsi',
'Krishna Bhagavaan',
'International Society for Krishna Consciousness',
'Krishna-Krishna',
'Hare Krishna',
'Krishna (TV series)']
# Load the first search hit and run the spaCy pipeline over the article text.
page = wikipedia.page(result[0], preload= True)
doc = nlp(page.content)#from spacy import displacy
page.url
'https://en.wikipedia.org/wiki/Krishna'
#displacy.serve(doc, style="ent")
# Preview the first few tokens of the parsed document with their POS tags.
max_token_display = 10
for idx, token in enumerate(doc):
    # Print the token and its part-of-speech tag
    print(token.text, "-->", token.pos_)
    # The check runs after printing and uses a strict '>', so the loop
    # emits max_token_display + 2 tokens before stopping.
    if idx > max_token_display:
        break
The --> DET
Mahābhārata --> PROPN
( --> PUNCT
US --> PROPN
: --> PUNCT
, --> PUNCT
UK --> PROPN
: --> PUNCT
; --> PUNCT
Sanskrit --> ADJ
: --> PUNCT
महाभारतम् --> X
# Preview the first few named entities: text, character offsets and label.
for idx, ent in enumerate(doc.ents):
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    # Same cut-off as the token preview: the check follows the print, so
    # max_token_display + 2 entities are shown.
    if idx > max_token_display:
        break
Mahābhārata 4 15 PERSON
US 17 19 GPE
UK 23 25 GPE
Sanskrit 29 37 LANGUAGE
महाभारतम् 39 48 CARDINAL
Mahābhāratam 50 62 PERSON
two 108 111 CARDINAL
Sanskrit 118 126 NORP
India 144 149 GPE
Rāmāyaṇa 171 179 PERSON
two 214 217 CARDINAL
the Kurukshetra War 239 258 EVENT
# Collect the text of every entity that the pipeline labelled as PERSON.
persons  = [ent.text for ent in doc.ents if ent.label_=='PERSON' ]
person_count = Counter(persons)
print(person_count)
{'Pandavas': 31, 'Krishna': 25, 'Mahābhārata': 24, 'Mahabharata': 23, 'Pandu': 17, 'Dhritarashtra': 15, 'Yudhishthira': 14, 'Bhishma': 11, 'Kunti': 11, 'Kaurava': 8, 'Satyavati': 6, 'Madri': 6, 'Gandhari': 6, 'Vyasa': 5, 'Kuru': 5, 'Pandava': 5, 'Vichitravirya': 5, 'Vidura': 5, 'Kauravas': 5, 'Rāmāyaṇa': 4, 'Bhima': 4, 'Draupadi': 4, 'Jain': 4, 'Gupta': 3, 'Janamejaya': 3, 'Jaya': 3, 'Minkowski': 3, 'Parikshit': 3, 'Devavrata': 3, 'Amba': 3, 'Karna': 3, 'Yama': 3, 'Yayati': 3, 'Jarasandha': 3, 'Motilal Banarsidass': 3, 'BCE': 2, 'Ugraśrava Sauti': 2, 'Vasu': 2, 'Oberlies': 2, 'Kālidāsa': 2, 'Mahapadma Nanda': 2, 'Adhisimakrishna': 2, 'Shakuni': 2, 'Dushasana': 2, 'Ghatotkacha': 2, 'J. L. Fitzgerald': 2, 'P. Lal': 2, 'Bibek Debroy': 2, 'Shyam Benegal': 2, 'Vasudeva': 2, 'Jaini': 2, 'Oldenberg': 2}
person_count = {k: v for k, v in sorted(person_count.items(), key=lambda item: item[1] , reverse=True) if v>1}
print(person_count)
{'Pandavas': 31, 'Krishna': 25, 'Mahābhārata': 24, 'Mahabharata': 23, 'Pandu': 17, 'Dhritarashtra': 15, 'Yudhishthira': 14, 'Bhishma': 11, 'Kunti': 11, 'Kaurava': 8, 'Satyavati': 6, 'Madri': 6, 'Gandhari': 6, 'Vyasa': 5, 'Kuru': 5, 'Pandava': 5, 'Vichitravirya': 5, 'Vidura': 5, 'Kauravas': 5, 'Rāmāyaṇa': 4, 'Bhima': 4, 'Draupadi': 4, 'Jain': 4, 'Gupta': 3, 'Janamejaya': 3, 'Jaya': 3, 'Minkowski': 3, 'Parikshit': 3, 'Devavrata': 3, 'Amba': 3, 'Karna': 3, 'Yama': 3, 'Yayati': 3, 'Jarasandha': 3, 'Motilal Banarsidass': 3, 'BCE': 2, 'Ugraśrava Sauti': 2, 'Vasu': 2, 'Oberlies': 2, 'Kālidāsa': 2, 'Mahapadma Nanda': 2, 'Adhisimakrishna': 2, 'Shakuni': 2, 'Dushasana': 2, 'Ghatotkacha': 2, 'J. L. Fitzgerald': 2, 'P. Lal': 2, 'Bibek Debroy': 2, 'Shyam Benegal': 2, 'Vasudeva': 2, 'Jaini': 2, 'Oldenberg': 2}
# Draw a horizontal bar chart of the person mention counts.
fig = plt.gcf()
ax = plt.gca()
fig.set_size_inches(25.5, 25.5)
plt.barh(list(person_count.keys()), list(person_count.values()))
plt.xticks(rotation=0, fontsize=40)
plt.yticks(rotation=0, fontsize=25)

# Annotate each bar with its count, placed slightly to the right of the bar end.
for i, v in enumerate(person_count.values()):
    ax.text(v + 2, i, str(v), color='black', fontsize=20)
plt.show()
png

Check other pages

# Repeat the analysis for the top search hit for "Jesus".
result = wikipedia.search("Jesus")
page = wikipedia.page(result[0], preload=True)
doc = nlp(page.content)

# Count PERSON mentions; most_common() orders by descending count (ties keep
# first-seen order), then names mentioned only once are dropped.
persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
person_count = {
    name: count for name, count in Counter(persons).most_common() if count > 1
}

fig = plt.gcf()
ax = plt.gca()
fig.set_size_inches(25.5, 25.5)
plt.barh(list(person_count.keys()), list(person_count.values()))
plt.xticks(rotation=0, fontsize=40)
plt.yticks(rotation=0, fontsize=25)

# Annotate each bar with its count, placed slightly to the right of the bar end.
for i, v in enumerate(person_count.values()):
    ax.text(v + 2, i, str(v), color='black', fontsize=20)
plt.show()
png
# Repeat the analysis for the top search hit for "Mahabharat".
result = wikipedia.search("Mahabharat")
page = wikipedia.page(result[0], preload=True)
doc = nlp(page.content)

# Count PERSON mentions; most_common() orders by descending count (ties keep
# first-seen order), then names mentioned only once are dropped.
persons = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']
person_count = {
    name: count for name, count in Counter(persons).most_common() if count > 1
}

fig = plt.gcf()
ax = plt.gca()
fig.set_size_inches(25.5, 25.5)
plt.barh(list(person_count.keys()), list(person_count.values()))
plt.xticks(rotation=0, fontsize=40)
plt.yticks(rotation=0, fontsize=25)

# Annotate each bar with its count, placed slightly to the right of the bar end.
for i, v in enumerate(person_count.values()):
    ax.text(v + 2, i, str(v), color='black', fontsize=20)
plt.show()
png

Create a function that combines all of the steps above

def plot_names_from_page(title = "Mahabharat"):
    """Plot how often each person is mentioned in a Wikipedia article.

    Searches Wikipedia for ``title``, loads the first search hit, runs the
    module-level spaCy pipeline ``nlp`` over the article text and draws a
    horizontal bar chart of every PERSON entity that occurs more than once.
    The URL of the analysed page is printed before the chart is shown.

    Args:
        title: Search query passed to ``wikipedia.search``.

    Raises:
        IndexError: If the search returns no results for ``title``.
    """
    result = wikipedia.search(title)
    if not result:
        # Same exception type as the original result[0] lookup, but with a
        # message that says what actually went wrong.
        raise IndexError(
            "Wikipedia search returned no results for {!r}".format(title)
        )

    # result[0] is already an exact title from the search, so auto-suggest is
    # disabled to stop the library from swapping it for a "suggested" title.
    page = wikipedia.page(result[0], auto_suggest=False, preload=True)
    doc = nlp(page.content)

    # Count PERSON mentions; most_common() orders by descending count (ties
    # keep first-seen order), then names mentioned only once are dropped.
    mentions = Counter(ent.text for ent in doc.ents if ent.label_ == 'PERSON')
    person_count = {
        name: count for name, count in mentions.most_common() if count > 1
    }
    print(page.url)

    # Use a fresh figure so repeated calls never draw over an earlier chart.
    fig, ax = plt.subplots(figsize=(25.5, 25.5))
    ax.barh(list(person_count.keys()), list(person_count.values()))
    plt.xticks(rotation=0, fontsize=40)
    plt.yticks(rotation=0, fontsize=25)

    # Annotate each bar with its count, slightly to the right of the bar end.
    for i, v in enumerate(person_count.values()):
        ax.text(v + 2, i, str(v), color='black', fontsize=20)
    plt.show()
plot_names_from_page('Illiad')
png
plot_names_from_page('Ramayan')
png
plot_names_from_page('World_War_I')
https://en.wikipedia.org/wiki/World_War_I
png
plot_names_from_page('great depression')
https://en.wikipedia.org/wiki/Great_Depression
png
plot_names_from_page('higgs boson')
https://en.wikipedia.org/wiki/Higgs_boson
png

Written by

Data Scientist / Data Engineer

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store