jarvis/backend/NLP.py

# Natural Language Processing using something like https://spacy.io

# THIS WORKS REALLY WELL FOR WHAT WE NEED: https://huggingface.co/facebook/bart-large-mnli
# Zero Shot Classification - Natural Language Inference
# basically this means we can list all the different skill names and the model will give us a 
# percentage probability that we are talking about each of them. We should be able to take the top
# value and pass valid information from the query into the skill class after extracting information
# using another model for tokenization. This took me an entire week of research to figure out -_-

# this will allow us to figure out what the query means,
# i.e. we might not have to add the word "wolfram" into a query to send it to wolfram...

# import spacy


# nlp = spacy.load("en_core_web_sm")

# doc = nlp("What is the weather in toronto")

# for token in doc.ents:
# 	print(token)
# 	# if token.like_num:
# 		# print(tokenx)


# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")

# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")

from transformers import pipeline
import spacy

# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."

# doc = nlp(text)


class NLP:
    """Score a query against candidate labels and extract its named entities.

    Creating an instance loads two models:

    * a Hugging Face ``zero-shot-classification`` pipeline using the
      ``facebook/bart-large-mnli`` checkpoint, stored as ``self.classifier``;
    * the spaCy ``en_core_web_sm`` pipeline, stored as ``self.tokenclass``.
    """

    def __init__(self, candidate_labels=None):
        """Store the candidate labels and load both models.

        Args:
            candidate_labels: Labels (e.g. skill names) that ``get_skill``
                classifies a sentence against. When omitted, a new empty
                list is created for this instance. A list passed in is
                stored by reference, not copied.
        """
        # A literal ``[]`` default is evaluated once at definition time and
        # would be shared by every instance created without an argument, so
        # labels appended to one instance would leak into all the others.
        self.candidate_labels = [] if candidate_labels is None else candidate_labels

        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
        )

        self.tokenclass = spacy.load("en_core_web_sm")

    def get_skill(self, sentence):
        """Classify ``sentence`` against ``self.candidate_labels``.

        Args:
            sentence: The query text to classify.

        Returns:
            Whatever the zero-shot pipeline returns for this call, unchanged.
            NOTE(review): per the Transformers docs this is a dict with
            ``sequence``, ``labels`` and ``scores`` sorted by descending
            score -- confirm against the installed version.
        """
        return self.classifier(sentence, self.candidate_labels)

    def get_named_entities(self, sentence):
        """Return the named entities spaCy finds in ``sentence``.

        Args:
            sentence: The query text to analyse.

        Returns:
            A list with one ``[text, start_char, end_char, label]`` list per
            entity, in the order spaCy reports them. Empty when no entity
            is found.
        """
        return [
            [ent.text, ent.start_char, ent.end_char, ent.label_]
            for ent in self.tokenclass(sentence).ents
        ]

        # for ent in doc.ents:
            # print(ent.text, ent.start_char, ent.end_char, ent.label_)

        # return self.tokenclass(sentence)


if __name__ == "__main__":
    # Manual smoke test: run each public method on two sample sentences and
    # print the raw results between banner lines.
    nlp = NLP(['travel', 'cooking', 'dancing', 'weather'])

    # (banner, method to call, the two sentences to feed it)
    demos = (
        (
            '==',
            nlp.get_skill,
            ("one day I will see the world", "What is the weather today?"),
        ),
        (
            '====',
            nlp.get_named_entities,
            ("one day I will see the world", "What is the weather today in london?"),
        ),
    )

    for banner, run, (first, second) in demos:
        print(banner)
        print(run(first))
        print("yay!")
        print(run(second))
        print(banner)
# sequence_to_classify = "one day I will see the world"
# candidate_labels = ['travel', 'cooking', 'dancing']
# print(classifier(sequence_to_classify, candidate_labels))


# import spacy
# from spacy.matcher import Matcher

# nlp = spacy.load("en_core_web_sm")
# matcher = Matcher(nlp.vocab)
# # Add match ID "HelloWorld" with no callback and one pattern
# pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
# matcher.add("HelloWorld", [pattern])

# doc = nlp("Hello, world! Hello world!")
# matches = matcher(doc)
# for match_id, start, end in matches:
#     string_id = nlp.vocab.strings[match_id]  # Get string representation
#     span = doc[start:end]  # The matched span
#     print(match_id, string_id, start, end, span.text)
reorganize codebase. Add in more of the framework for adding skills. 2023-06-14 15:57:12 +00:00			`# Natural Language Processing using something like https://spacy.io`

added starts of NLP processing using NLI model 2023-06-21 14:59:27 +00:00			`# THIS WORKS REALLY WELL FOR WHAT WE NEED: https://huggingface.co/facebook/bart-large-mnli`
			`# Zero Shot Classification - Natrual Language Inference`
			`# basically this means we can list all the different skill names and the model will give us a`
			`# percentage probability that we are talking about each of them. We should be able to take the top`
			`# value and pass valid information from the query into the skill class after extracting information`
			`# using another model for tokenization. This took me an entire week of research to figure out -_-`

reorganize codebase. Add in more of the framework for adding skills. 2023-06-14 15:57:12 +00:00			`# this will allow us to figure out what the query means`
			`# i.e we might not have to add the word "wolfram" into a query to send it to wolfram...`
added starts of NLP processing using NLI model 2023-06-21 14:59:27 +00:00
			`# import spacy`


			`# nlp = spacy.load("en_core_web_sm")`

			`# doc = nlp("What is the weather in toronto")`

			`# for token in doc.ents:`
			`# print(token)`
			`# # if token.like_num:`
			`# # print(tokenx)`


			`# from transformers import AutoTokenizer, AutoModelForSequenceClassification`

			`# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")`

			`# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")`

			`from transformers import pipeline`
			`import spacy`

			`# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."`

			`# doc = nlp(text)`




			`class NLP:`
			`def __init__(self, candidate_labels=[]):`
			`self.candidate_labels = candidate_labels`

			`self.classifier = pipeline("zero-shot-classification",`
			`model="facebook/bart-large-mnli")`

			`self.tokenclass = spacy.load("en_core_web_sm")`



			`def get_skill(self, sentence):`
			`return self.classifier(sentence, self.candidate_labels)`

			`def get_named_entities(self, sentence):`
			`return [[ent.text, ent.start_char, ent.end_char, ent.label_] for ent in self.tokenclass(sentence).ents]`

			`# for ent in doc.ents:`
			`# print(ent.text, ent.start_char, ent.end_char, ent.label_)`

			`# return self.tokenclass(sentence)`



			`if __name__ == "__main__":`
			`nlp = NLP(['travel', 'cooking', 'dancing', 'weather'])`
			`print('==')`
			`print(nlp.get_skill("one day I will see the world"))`
			`print("yay!")`
			`print(nlp.get_skill("What is the weather today?"))`
			`print('==')`

			`print('====')`
			`print(nlp.get_named_entities("one day I will see the world"))`
			`print("yay!")`
			`print(nlp.get_named_entities("What is the weather today in london?"))`
			`print('====')`
			`# sequence_to_classify = "one day I will see the world"`
			`# candidate_labels = ['travel', 'cooking', 'dancing']`
			`# print(classifier(sequence_to_classify, candidate_labels))`


			`# import spacy`
			`# from spacy.matcher import Matcher`

			`# nlp = spacy.load("en_core_web_sm")`
			`# matcher = Matcher(nlp.vocab)`
			`# # Add match ID "HelloWorld" with no callback and one pattern`
			`# pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]`
			`# matcher.add("HelloWorld", [pattern])`

			`# doc = nlp("Hello, world! Hello world!")`
			`# matches = matcher(doc)`
			`# for match_id, start, end in matches:`
			`# string_id = nlp.vocab.strings[match_id] # Get string representation`
			`# span = doc[start:end] # The matched span`
			`# print(match_id, string_id, start, end, span.text)`