added starts of NLP processing using NLI model

2023-06-21 10:59:27 -04:00 · 2023-06-21 10:59:27 -04:00 · f9d54838d8
commit f9d54838d8
parent 9b70ea6a22
1 changed files with 94 additions and 0 deletions
--- a/backend/NLP.py
+++ b/backend/NLP.py
@ -1,4 +1,98 @@
 # Natural Language Processing using something like https://spacy.io

+# THIS WORKS REALLY WELL FOR WHAT WE NEED: https://huggingface.co/facebook/bart-large-mnli
+# Zero Shot Classification - Natural Language Inference
+# basically this means we can list all the different skill names and the model will give us a 
+# percentage probability that we are talking about each of them. We should be able to take the top
+# value and pass valid information from the query into the skill class after extracting information
+# using another model for tokenization. This took me an entire week of research to figure out -_-
+
 # this will allow us to figure out what the query means 
 # i.e. we might not have to add the word "wolfram" into a query to send it to wolfram...
+
+# import spacy
+
+
+# nlp = spacy.load("en_core_web_sm")
+
+# doc = nlp("What is the weather in toronto")
+
+# for token in doc.ents:
+# 	print(token)
+# 	# if token.like_num:
+# 		# print(tokenx)
+
+
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+# tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-mnli")
+
+# model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large-mnli")
+
+from transformers import pipeline
+import spacy
+
+# text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
+
+# doc = nlp(text)
+
+
+
+
+class NLP:
+    """Zero-shot skill classification plus spaCy named-entity extraction for a query."""
+
+    def __init__(self, candidate_labels=None):
+        """Load both models; ``candidate_labels`` lists the skill names to score."""
+        # A fresh list per instance: a ``[]`` default would be shared by every NLP object.
+        self.candidate_labels = [] if candidate_labels is None else candidate_labels
+        self.classifier = pipeline("zero-shot-classification",
+                                   model="facebook/bart-large-mnli")
+        self.tokenclass = spacy.load("en_core_web_sm")
+
+    def get_skill(self, sentence):
+        """Return the classifier's output for ``sentence`` against the candidate labels."""
+        return self.classifier(sentence, self.candidate_labels)
+
+    def get_named_entities(self, sentence):
+        """Return ``[text, start_char, end_char, label]`` for each entity spaCy finds."""
+        return [
+            [ent.text, ent.start_char, ent.end_char, ent.label_]
+            for ent in self.tokenclass(sentence).ents
+        ]
+
+
+
+if __name__ == "__main__":
+    # Manual smoke test (loads both models): skill scores first, then entities.
+    nlp = NLP(['travel', 'cooking', 'dancing', 'weather'])
+    demos = (
+        ('==', nlp.get_skill, "What is the weather today?"),
+        ('====', nlp.get_named_entities, "What is the weather today in london?"),
+    )
+    for fence, run, weather_query in demos:
+        print(fence)
+        print(run("one day I will see the world"))
+        print("yay!")
+        print(run(weather_query))
+        print(fence)
+# sequence_to_classify = "one day I will see the world"
+# candidate_labels = ['travel', 'cooking', 'dancing']
+# print(classifier(sequence_to_classify, candidate_labels))
+
+
+# import spacy
+# from spacy.matcher import Matcher
+
+# nlp = spacy.load("en_core_web_sm")
+# matcher = Matcher(nlp.vocab)
+# # Add match ID "HelloWorld" with no callback and one pattern
+# pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
+# matcher.add("HelloWorld", [pattern])
+
+# doc = nlp("Hello, world! Hello world!")
+# matches = matcher(doc)
+# for match_id, start, end in matches:
+#     string_id = nlp.vocab.strings[match_id]  # Get string representation
+#     span = doc[start:end]  # The matched span
+#     print(match_id, string_id, start, end, span.text)