# Natural Language Processing using something like https://spacy.io
# THIS WORKS REALLY WELL FOR WHAT WE NEED: https://huggingface.co/facebook/bart-large-mnli
# Zero Shot Classification - Natural Language Inference
# basically this means we can list all the different skill names and the model will give us a
# percentage probability that we are talking about each of them. We should be able to take the top
# value and pass valid information from the query into the skill class after extracting information
# using another model for tokenization. This took me an entire week of research to figure out -_-
# this will allow us to figure out what the query means
# i.e we might not have to add the word "wolfram" into a query to send it to wolfram...

# import spacy
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("What is the weather in toronto")
# for token in doc.ents:
#     print(token)
#     # if token.like_num:
#     #     print(tokenx)

# from transformers import AutoTokenizer, AutoModelForSequenceClassification
# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

import time
from datetime import datetime

import spacy
from transformers import pipeline

# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
# doc = nlp(text)


class NLP:
    """Classify a sentence against skill labels and extract its named entities.

    Two models are loaded eagerly when an instance is constructed:

    * ``classifier``: a Hugging Face ``zero-shot-classification`` pipeline
      backed by ``facebook/bart-large-mnli``.
    * ``tokenclass``: the spaCy ``en_core_web_sm`` pipeline.
    """

    def __init__(self, candidate_labels=None):
        """Load both models and remember the labels to classify against.

        Args:
            candidate_labels: Skill names handed to the zero-shot classifier
                on every ``get_skill`` call. When omitted, a new empty list
                is created for this instance. A list supplied by the caller
                is stored by reference, not copied.
        """
        # A literal ``[]`` default is created once at definition time and
        # would be shared by every instance constructed without labels.
        self.candidate_labels = [] if candidate_labels is None else candidate_labels
        self.classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
        self.tokenclass = spacy.load("en_core_web_sm")

    def get_skill(self, sentence):
        """Run the zero-shot classifier on ``sentence``.

        Args:
            sentence: Text to classify against ``self.candidate_labels``.

        Returns:
            Whatever ``self.classifier`` returns, unmodified.
            NOTE(review): for a single string the transformers pipeline is
            expected to return a mapping of labels and scores -- confirm
            against the installed transformers version.
        """
        return self.classifier(sentence, self.candidate_labels)

    def get_named_entities(self, sentence: str) -> list:
        """Return the named entities found in ``sentence``.

        Args:
            sentence: Text to run through ``self.tokenclass``.

        Returns:
            One ``[text, start_char, end_char, label_]`` list per entity, in
            the order they appear in the parsed document's ``ents``.
        """
        document = self.tokenclass(sentence)
        return [
            [ent.text, ent.start_char, ent.end_char, ent.label_]
            for ent in document.ents
        ]
        # Earlier variants, kept for reference:
        # for ent in doc.ents:
        #     print(ent.text, ent.start_char, ent.end_char, ent.label_)
        # return self.tokenclass(sentence)


def _timed_print(func, sentence):
    """Print ``func(sentence)``, then how long the call and print took."""
    started = time.perf_counter()
    print(func(sentence))
    print(f"Took: {time.perf_counter()-started}")


def _main():
    """Manual smoke test: time model loading and a handful of sample queries."""
    # perf_counter is monotonic, so elapsed times cannot go negative if the
    # wall clock is adjusted while the models are loading.
    started = time.perf_counter()
    nlp = NLP(['weather', 'timer', 'physics', 'mathematics'])
    print(f"Init: {time.perf_counter()-started}")

    print('==')
    _timed_print(nlp.get_skill, "one day I will see the world")
    print("yay!")
    _timed_print(nlp.get_skill, "What is the weather today?")
    print('==')
    _timed_print(nlp.get_skill, "What is air resistance of a spaceship with a mass of 1000kg")
    _timed_print(nlp.get_skill, "What is five plus five")
    print('====')

    _timed_print(nlp.get_named_entities, "one day I will see the world")
    print("yay!")
    _timed_print(nlp.get_named_entities, "What is the weather today in london?")
    print('====')

    print('======')
    _timed_print(nlp.get_named_entities, "set a timer for 1 minute and 15 seconds")
    _timed_print(nlp.get_named_entities, "remind me at May 5th at 2:30 in the afternoon to wash the dog")
    _timed_print(nlp.get_skill, "remind me at May 5th at 2:30 in the afternoon to wash the dog")
    print('======')


if __name__ == "__main__":
    _main()

# sequence_to_classify = "one day I will see the world"
# candidate_labels = ['travel', 'cooking', 'dancing']
# print(classifier(sequence_to_classify, candidate_labels))

# import spacy
# from spacy.matcher import Matcher
# nlp = spacy.load("en_core_web_sm")
# matcher = Matcher(nlp.vocab)
# # Add match ID "HelloWorld" with no callback and one pattern
# pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
# matcher.add("HelloWorld", [pattern])
# doc = nlp("Hello, world! Hello world!")
# matches = matcher(doc)
# for match_id, start, end in matches:
#     string_id = nlp.vocab.strings[match_id]  # Get string representation
#     span = doc[start:end]  # The matched span
#     print(match_id, string_id, start, end, span.text)