Module auto_osint_v.specific_entity_processor
This module will identify specific entities
These entities are extracted from the intelligence statement and stored in appropriate stores. Subprocesses to this module attempt to interrogate some of this information.
Expand source code
"""This module will identify specific entities
These entities are extracted from the intelligence statement and
stored in appropriate stores.
Subprocesses to this module attempt to interrogate some of this information.
"""
import os
import spacy
from spacy.tokens import DocBin
# Load the best model trained using Google Colab.
# The model directory is first looked up below the current working directory.
# If that fails, the working directory is changed and the load is retried with
# a relative path. os.chdir affects the whole process, so the new working
# directory stays in effect after this module has been imported.
try:
    NER = spacy.load(os.getcwd() + "/NER_training_testing/train/model/model-best-from-colab")
except OSError:
    # First attempt raised OSError: switch to the "auto_osint_v" directory
    # reached through the parent of the current directory and retry.
    try:
        os.chdir("../auto_osint_v/")
        NER = spacy.load("NER_training_testing/train/model/model-best-from-colab")
    except FileNotFoundError:
        # Last resort: descend into "auto_osint_v" below the current directory.
        # NOTE(review): only FileNotFoundError is handled here; if spacy.load
        # signals a missing model with a plain OSError (as the outer handler
        # suggests) that error would propagate instead - confirm.
        os.chdir(os.getcwd() + "/auto_osint_v/")
        NER = spacy.load("NER_training_testing/train/model/model-best-from-colab")
# Add the 'sentencizer' component to the loaded pipeline.
NER.add_pipe('sentencizer')
class EntityProcessor:
    """Extracts entities from text using the module-level ``NER`` pipeline.

    It provides methods for recognising the individual entities in a statement,
    storing them through the file handler, and counting entity mentions across
    a list of texts.
    """

    def __init__(self, file_handler_object):
        """Initialises variables to be used in this object.

        Args:
            file_handler_object: the file handler to be used for file IO operations
        """
        self.file_handler = file_handler_object
        # Lower-cased entity texts that add_entities_to_dict never counts.
        self.irrelevant_words = ["it", "them", "they", "the", "he", "she", "his", "her", "we", "i",
                                 "us", "me", "my", "here", "our"]

    def store_words_from_label(self, read_statement):
        """Stores the entities recognised in a statement through the file handler.

        Each distinct entity text is passed once to the file handler together
        with its label and the number of times it was mentioned.

        Args:
            read_statement: the intelligence statement read into current python instance

        Returns:
            Nothing - stores info in files
        """
        # Clean any leftover files from previous runs
        self.file_handler.clean_directory("data_files/target_info_files")

        doc = NER(read_statement)

        # Maps entity text -> [label, number of mentions]; the label of the
        # first occurrence is the one that is kept.
        mentions_by_text = {}
        for entity in doc.ents:
            record = mentions_by_text.get(entity.text)
            if record is None:
                mentions_by_text[entity.text] = [entity.label_, 1]
            else:
                record[1] += 1

        for text, (label, mentions) in mentions_by_text.items():
            # Hands the word text and number of mentions to the file handler,
            # which selects the destination from the label.
            self.file_handler.open_label_file(label, text, mentions=mentions)

    def get_entities_and_count(self, text_list, entity_dict):
        """Finds the entities in the given texts and updates their counts.

        Texts longer than 500 characters are split into 500-character pieces
        before being processed. An entity is counted at most once per call, so
        only independent mentions (one per source) increase its value.

        Args:
            text_list: The texts (strings) to find and count entities from.
                The list itself is left unmodified.
            entity_dict: The dictionary to store these entities and their respective counts in.

        Returns:
            A list of (entity, count) pairs taken from entity_dict after the
            new entries have been added.
        """
        chunks = self._split_long_texts(text_list)
        entity_dict = self.add_entities_to_dict(entity_dict, chunks)
        return list(entity_dict.items())

    @staticmethod
    def _split_long_texts(texts, max_chars=500):
        """Returns a new flat list in which no text exceeds max_chars characters.

        Short texts are kept whole; longer ones are cut every max_chars
        characters. Building a fresh list avoids flattening a mix of strings
        and lists, which would break the short texts into single characters.

        Args:
            texts: iterable of strings.
            max_chars: maximum length of each returned piece.

        Returns:
            A list of strings, in the original order.
        """
        chunks = []
        for text in texts:
            if len(text) > max_chars:
                chunks.extend(text[start:start + max_chars]
                              for start in range(0, len(text), max_chars))
            else:
                chunks.append(text)
        return chunks

    def add_entities_to_dict(self, entity_dict, texts):
        """Uses the NER.pipe to add entities to a given dictionary.

        Every distinct entity (compared in lowercase) found anywhere in texts
        increases its count in entity_dict by exactly one, unless it is listed
        in self.irrelevant_words.

        Args:
            entity_dict: the given dictionary to add entities to.
            texts: list of texts to process

        Returns:
            entity_dict, the given dictionary with added entities.
        """
        # Nothing to process: do not invoke the pipeline at all.
        if isinstance(texts, (list, tuple)) and not texts:
            return entity_dict

        # Entities already counted during this call (set for O(1) lookups).
        counted = set()
        for doc in NER.pipe(texts):
            for ent in doc.ents:
                # set to lowercase for easy comparison
                key = ent.text.lower()
                if key in counted or key in self.irrelevant_words:
                    continue
                entity_dict[key] = entity_dict.get(key, 0) + 1
                counted.add(key)
        return entity_dict
Classes
class EntityProcessor (file_handler_object)
-
This class extracts the entities from a given statement
It provides methods for recognising the individual entities in a statement and storing them appropriately.
Initialises variables to be used in this object.
Args: file_handler_object: the file handler to be used for file IO operations
Expand source code
class EntityProcessor:
    """Extracts entities from text using the module-level ``NER`` pipeline.

    It provides methods for recognising the individual entities in a statement,
    storing them through the file handler, and counting entity mentions across
    a list of texts.
    """

    def __init__(self, file_handler_object):
        """Initialises variables to be used in this object.

        Args:
            file_handler_object: the file handler to be used for file IO operations
        """
        self.file_handler = file_handler_object
        # Lower-cased entity texts that add_entities_to_dict never counts.
        self.irrelevant_words = ["it", "them", "they", "the", "he", "she", "his", "her", "we", "i",
                                 "us", "me", "my", "here", "our"]

    def store_words_from_label(self, read_statement):
        """Stores the entities recognised in a statement through the file handler.

        Each distinct entity text is passed once to the file handler together
        with its label and the number of times it was mentioned.

        Args:
            read_statement: the intelligence statement read into current python instance

        Returns:
            Nothing - stores info in files
        """
        # Clean any leftover files from previous runs
        self.file_handler.clean_directory("data_files/target_info_files")

        doc = NER(read_statement)

        # Maps entity text -> [label, number of mentions]; the label of the
        # first occurrence is the one that is kept.
        mentions_by_text = {}
        for entity in doc.ents:
            record = mentions_by_text.get(entity.text)
            if record is None:
                mentions_by_text[entity.text] = [entity.label_, 1]
            else:
                record[1] += 1

        for text, (label, mentions) in mentions_by_text.items():
            # Hands the word text and number of mentions to the file handler,
            # which selects the destination from the label.
            self.file_handler.open_label_file(label, text, mentions=mentions)

    def get_entities_and_count(self, text_list, entity_dict):
        """Finds the entities in the given texts and updates their counts.

        Texts longer than 500 characters are split into 500-character pieces
        before being processed. An entity is counted at most once per call, so
        only independent mentions (one per source) increase its value.

        Args:
            text_list: The texts (strings) to find and count entities from.
                The list itself is left unmodified.
            entity_dict: The dictionary to store these entities and their respective counts in.

        Returns:
            A list of (entity, count) pairs taken from entity_dict after the
            new entries have been added.
        """
        chunks = self._split_long_texts(text_list)
        entity_dict = self.add_entities_to_dict(entity_dict, chunks)
        return list(entity_dict.items())

    @staticmethod
    def _split_long_texts(texts, max_chars=500):
        """Returns a new flat list in which no text exceeds max_chars characters.

        Short texts are kept whole; longer ones are cut every max_chars
        characters. Building a fresh list avoids flattening a mix of strings
        and lists, which would break the short texts into single characters.

        Args:
            texts: iterable of strings.
            max_chars: maximum length of each returned piece.

        Returns:
            A list of strings, in the original order.
        """
        chunks = []
        for text in texts:
            if len(text) > max_chars:
                chunks.extend(text[start:start + max_chars]
                              for start in range(0, len(text), max_chars))
            else:
                chunks.append(text)
        return chunks

    def add_entities_to_dict(self, entity_dict, texts):
        """Uses the NER.pipe to add entities to a given dictionary.

        Every distinct entity (compared in lowercase) found anywhere in texts
        increases its count in entity_dict by exactly one, unless it is listed
        in self.irrelevant_words.

        Args:
            entity_dict: the given dictionary to add entities to.
            texts: list of texts to process

        Returns:
            entity_dict, the given dictionary with added entities.
        """
        # Nothing to process: do not invoke the pipeline at all.
        if isinstance(texts, (list, tuple)) and not texts:
            return entity_dict

        # Entities already counted during this call (set for O(1) lookups).
        counted = set()
        for doc in NER.pipe(texts):
            for ent in doc.ents:
                # set to lowercase for easy comparison
                key = ent.text.lower()
                if key in counted or key in self.irrelevant_words:
                    continue
                entity_dict[key] = entity_dict.get(key, 0) + 1
                counted.add(key)
        return entity_dict
Methods
def add_entities_to_dict(self, entity_dict, texts)
-
Uses the NER.pipe to add entities to a given dictionary.
Args
entity_dict
- the given dictionary to add entities to.
texts
- list of texts to process
Returns
entity_dict, the given dictionary with added entities.
Expand source code
def add_entities_to_dict(self, entity_dict, texts):
    """Uses the NER.pipe to add entities to a given dictionary.

    Every distinct entity (compared in lowercase) found anywhere in texts
    increases its count in entity_dict by exactly one, unless it is listed
    in self.irrelevant_words.

    Args:
        entity_dict: the given dictionary to add entities to.
        texts: list of texts to process

    Returns:
        entity_dict, the given dictionary with added entities.
    """
    # Nothing to process: do not invoke the pipeline at all.
    if isinstance(texts, (list, tuple)) and not texts:
        return entity_dict

    # Entities already counted during this call (set for O(1) lookups).
    counted = set()
    for doc in NER.pipe(texts):
        for ent in doc.ents:
            # set to lowercase for easy comparison
            key = ent.text.lower()
            if key in counted or key in self.irrelevant_words:
                continue
            entity_dict[key] = entity_dict.get(key, 0) + 1
            counted.add(key)
    return entity_dict
def get_entities_and_count(self, text_list, entity_dict)
-
Finds the entities from the given text. If they appear multiple times, increment value.
This only increments words one time per source. Only count independent mentions of entities.
Args
text_list
- The text to find and count entities from.
entity_dict
- The dictionary to store these entities and their respective counts in.
Returns
A list of (entity, count) pairs taken from entity_dict after the new entries have been added.
Expand source code
def get_entities_and_count(self, text_list, entity_dict):
    """Finds the entities in the given texts and updates their counts.

    Texts longer than 500 characters are split into 500-character pieces
    before being processed. An entity is counted at most once per call, so
    only independent mentions (one per source) increase its value.

    Args:
        text_list: The texts (strings) to find and count entities from.
            The list itself is left unmodified.
        entity_dict: The dictionary to store these entities and their respective counts in.

    Returns:
        A list of (entity, count) pairs taken from entity_dict after the
        new entries have been added.
    """
    max_chars = 500  # reduce if 'token indices sequence length is longer' warnings appear

    # Build a fresh flat list. Flattening a mix of strings and lists would
    # break every short text into single characters.
    chunks = []
    for text in text_list:
        if len(text) > max_chars:
            chunks.extend(text[start:start + max_chars]
                          for start in range(0, len(text), max_chars))
        else:
            chunks.append(text)

    entity_dict = self.add_entities_to_dict(entity_dict, chunks)
    return list(entity_dict.items())
def store_words_from_label(self, read_statement)
-
This function stores recognised words in csv files
These files are associated with the label given to the word.
Args
read_statement
- the intelligence statement read into current python instance
Returns: Nothing; the information is stored in files.
Expand source code
def store_words_from_label(self, read_statement):
    """Stores the entities recognised in a statement through the file handler.

    Each distinct entity text is passed once to the file handler together
    with its label and the number of times it was mentioned.

    Args:
        read_statement: the intelligence statement read into current python instance

    Returns:
        Nothing - stores info in files
    """
    # Remove whatever a previous run left behind.
    self.file_handler.clean_directory("data_files/target_info_files")

    doc = NER(read_statement)

    # Maps entity text -> [label, number of mentions]; the label of the
    # first occurrence is the one that is kept.
    mentions_by_text = {}
    for entity in doc.ents:
        record = mentions_by_text.get(entity.text)
        if record is None:
            mentions_by_text[entity.text] = [entity.label_, 1]
        else:
            record[1] += 1

    for text, (label, mentions) in mentions_by_text.items():
        # Hands the word text and number of mentions to the file handler,
        # which selects the destination from the label.
        self.file_handler.open_label_file(label, text, mentions=mentions)