When we call nlp on a piece of text, spaCy tokenizes the text and creates a document (Doc) object.
# Setup (run once, e.g. in a notebook cell):
# !pip install spacy tabulate
import spacy
from spacy import displacy
from tabulate import tabulate

# Load the `en_core_web_md` pipeline; it must already be installed.
nlp = spacy.load('en_core_web_md')

# Calling the pipeline on a string tokenizes it and returns a document object.
doc = nlp("Apple is looking at buying a UK startup for $1 billion")

# Let's see all the tokens. The document is iterable and supports indexing,
# including negative indices for access from the end.
print(f"Tokens in text: {list(doc)}")
print(f"First Token: {doc[0]}")
print(f"Last Token: {doc[-1]}")
Tokens in text: [Apple, is, looking, at, buying, a, UK, startup, for, $, 1, billion]
First Token: Apple
Last Token: billion
# Create a slice of the document: tokens at indices 2, 3 and 4
# (as with Python lists, the end index 5 is exclusive).
looking_to_buy = doc[2:5]
print(f"Slice: {looking_to_buy}")
Let's create a custom matcher to identify which franchise a movie belongs to.
from spacy.matcher import PhraseMatcher
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")

# Movie titles grouped by franchise; each list becomes one labelled pattern set.
bond_movies = [
    "Casino Royale",
    "Quantum of Solace",
    "Skyfall",
    "Spectre",
    "No Time To Die",
]
star_wars_movies = [
    "The Phantom Menace",
    "Attack of the Clones",
    "Revenge of the Sith",
    "A New Hope",
    "The Force Awakens",
    "The Last Jedi",
    "The Rise of Skywalker",
]

# Create PhraseMatcher and add patterns.
# `nlp.make_doc` is used to turn each title into a pattern document.
matcher = PhraseMatcher(nlp.vocab)
bond_patterns = [nlp.make_doc(text) for text in bond_movies]
star_wars_patterns = [nlp.make_doc(text) for text in star_wars_movies]
matcher.add("BOND_MOVIE", bond_patterns)
matcher.add("STAR_WARS_MOVIE", star_wars_patterns)

# Process texts
texts = [
    "I watched No Time To Die last night in India. Great movie!",
    "The Last Jedi is an American movie. I watched it in 2019.",
]
for text in texts:
    doc = nlp(text)
    matches = matcher(doc)
    # The match loop must run inside the per-text loop; otherwise only the
    # matches of the last text would be reported.
    for match_id, start, end in matches:
        span = doc[start:end]
        # `match_id` is looked up in the vocab's string store to recover the
        # label that was passed to `matcher.add`.
        print(f"Entity: '{span.text}'\t Label: '{nlp.vocab.strings[match_id]}'")
Entity: 'No Time To Die' Label: 'BOND_MOVIE'
Entity: 'The Last Jedi' Label: 'STAR_WARS_MOVIE'