spaCy 101

Published

September 6, 2024

Notes from https://spacy.io/usage/spacy-101

When we call nlp, spaCy tokenizes the text and creates a Doc object.

# !pip install spacy tabulate
import spacy
from spacy import displacy
from tabulate import tabulate

# Load the medium English pipeline (includes word vectors, tagger, parser, NER).
nlp = spacy.load('en_core_web_md')
# Calling nlp() tokenizes the text and runs the full pipeline, returning a Doc.
doc = nlp("Apple is looking at buying a UK startup for $1 billion")

# A Doc is a sequence of Token objects — iterable and indexable like a list,
# so list(doc) replaces the redundant [t for t in doc] copy-comprehension.
print(f"Tokens in text: {list(doc)}")
print(f"First Token: {doc[0]}")
print(f"Last Token: {doc[-1]}")
Tokens in text: [Apple, is, looking, at, buying, a, UK, startup, for, $, 1, billion]
First Token: Apple
Last Token: billion
# Slicing a Doc yields a Span — a lightweight view onto the tokens.
buying_span = doc[2:5]
print(f"Slice: {buying_span}")
Slice: looking at buying
# Render the named entities inline (highlights ORG / GPE / MONEY spans).
displacy.render(doc, style="ent")
Apple ORG is looking at buying a UK GPE startup for $1 billion MONEY
# Render the dependency parse (arcs between heads and their children).
displacy.render(doc, style="dep")
Apple PROPN is AUX looking VERB at ADP buying VERB a DET UK PROPN startup NOUN for ADP $ SYM 1 NUM billion NUM nsubj aux prep pcomp det compound dobj prep quantmod compound pobj
# Per-token linguistic attributes, rendered as a grid.
# Trailing-underscore attributes (lemma_, pos_, ...) are the string forms;
# the bare names (lemma, pos) are integer hashes.
columns = ["text", "lemma", "pos", "tag", "dep", "shape", "is_alpha", "is_stop"]
rows = []
for token in doc:
    rows.append([token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                 token.shape_, token.is_alpha, token.is_stop])

print(tabulate(rows, headers=columns, tablefmt="grid"))
+---------+---------+-------+-------+----------+---------+------------+-----------+
| text    | lemma   | pos   | tag   | dep      | shape   | is_alpha   | is_stop   |
+=========+=========+=======+=======+==========+=========+============+===========+
| Apple   | Apple   | PROPN | NNP   | nsubj    | Xxxxx   | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| is      | be      | AUX   | VBZ   | aux      | xx      | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| looking | look    | VERB  | VBG   | ROOT     | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| at      | at      | ADP   | IN    | prep     | xx      | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| buying  | buy     | VERB  | VBG   | pcomp    | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| a       | a       | DET   | DT    | det      | x       | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| UK      | UK      | PROPN | NNP   | compound | XX      | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| startup | startup | NOUN  | NN    | dobj     | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| for     | for     | ADP   | IN    | prep     | xxx     | True       | True      |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| $       | $       | SYM   | $     | quantmod | $       | False      | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| 1       | 1       | NUM   | CD    | compound | d       | False      | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+
| billion | billion | NUM   | CD    | pobj     | xxxx    | True       | False     |
+---------+---------+-------+-------+----------+---------+------------+-----------+

Named Entities

# Entity spans with their token offsets (start inclusive, end exclusive).
ent_columns = ["text", "start", "end", "label"]
ent_rows = []
for ent in doc.ents:
    ent_rows.append([ent.text, ent.start, ent.end, ent.label_])
print(tabulate(ent_rows, headers=ent_columns, tablefmt="grid"))
+------------+---------+-------+---------+
| text       |   start |   end | label   |
+============+=========+=======+=========+
| Apple      |       0 |     1 | ORG     |
+------------+---------+-------+---------+
| UK         |       6 |     7 | GPE     |
+------------+---------+-------+---------+
| $1 billion |       9 |    12 | MONEY   |
+------------+---------+-------+---------+

spaCy Pipeline

Custom matcher for NER

Let's create a custom matcher to identify which franchise a movie belongs to.

from spacy.matcher import PhraseMatcher
from spacy.tokenizer import Tokenizer  # NOTE(review): unused in this example — confirm before removing

nlp = spacy.load("en_core_web_sm")

# Movie titles to recognise, grouped by franchise.
bond_movies = ["Casino Royale", "Quantum of Solace", "Skyfall", "Spectre", "No Time To Die"]
star_wars_movies = ["The Phantom Menace", "Attack of the Clones", "Revenge of the Sith", "A New Hope", 
                    "The Force Awakens", "The Last Jedi", "The Rise of Skywalker"]

# PhraseMatcher matches exact token sequences. nlp.make_doc() only tokenizes
# (no tagging/parsing), which is all the matcher needs for its patterns.
matcher = PhraseMatcher(nlp.vocab)
bond_patterns = [nlp.make_doc(text) for text in bond_movies]
star_wars_patterns = [nlp.make_doc(text) for text in star_wars_movies]

matcher.add("BOND_MOVIE", bond_patterns)
matcher.add("STAR_WARS_MOVIE", star_wars_patterns)

# Texts to scan for franchise titles.
texts = ["I watched No Time To Die last night in India. Great movie!",
         "The Last Jedi is an American movie. I watched it in 2019."]

# nlp.pipe() processes the texts as a batch, which is faster than calling
# nlp(text) separately inside the loop.
for doc in nlp.pipe(texts):
    matches = matcher(doc)
    for match_id, start, end in matches:
        span = doc[start:end]
        # match_id is a hash; resolve it to the label via the StringStore.
        print(f"Entity: '{span.text}'\t Label: '{nlp.vocab.strings[match_id]}'")
Entity: 'No Time To Die'     Label: 'BOND_MOVIE'
Entity: 'The Last Jedi'  Label: 'STAR_WARS_MOVIE'