# -*- coding: utf-8 -*-
"""
MLConjug Main module.
| This module declares the main classes the user interacts with.
| The module defines the classes needed to interface with Machine Learning models.
"""
from .PyVerbiste import Verbiste, VerbInfo, Verb, VerbEn, VerbEs, VerbFr, VerbIt, VerbPt, VerbRo, ConjugManager
from .__init__ import Pipeline, SelectFromModel, CountVectorizer, LinearSVC, SGDClassifier
import random
from collections import defaultdict
import pickle
import pkg_resources
import re
from zipfile import ZipFile
from functools import partial
# Package used as the anchor when locating bundled resources.
_RESOURCE_PACKAGE = __name__

# Display name of every supported language, keyed by its two-letter code.
_LANGUAGE_FULL = dict(
    fr='Français',
    en='English',
    es='Español',
    it='Italiano',
    pt='Português',
    ro='Română',
)

# Verb class used to build the conjugated verb object for each language.
_VERBS = dict(
    fr=VerbFr,
    en=VerbEn,
    es=VerbEs,
    it=VerbIt,
    pt=VerbPt,
    ro=VerbRo,
)

# Resource path (always '/'-separated) of the zipped pre-trained model for each language.
_PRE_TRAINED_MODEL_PATH = {
    lang: 'data/models/trained_model-{0}-final.zip'.format(lang)
    for lang in ('fr', 'it', 'es', 'en', 'pt', 'ro')
}

# Vowels and consonants of each supported language.
# NOTE(review): 'á' appears twice in the Portuguese vowels -- possibly a typo for 'â'.
# It is kept unchanged here; confirm before editing.
_ALPHABET = {
    'fr': dict(vowels='aáàâeêéèiîïoôöœuûùy',
               consonants='bcçdfghjklmnpqrstvwxyz'),
    'en': dict(vowels='aeiouy',
               consonants='bcdfghjklmnpqrstvwxyz'),
    'es': dict(vowels='aáeiíoóuúy',
               consonants='bcdfghjklmnñpqrstvwxyz'),
    'it': dict(vowels='aàeéèiìîoóòuùy',
               consonants='bcdfghjklmnpqrstvwxyz'),
    'pt': dict(vowels='aàãááeêéiíoóõuúy',
               consonants='bcçdfghjklmnpqrstvwxyz'),
    'ro': dict(vowels='aăâeiîouy',
               consonants='bcdfghjklmnpqrsșştțţvwxyz'),
}
class Conjugator:
    """
    | This is the main class of the project.
    | The class manages the Verbiste data set and provides an interface with the scikit-learn pipeline.
    | If no parameters are provided, the default language is set to french and the pre-trained french conjugation pipeline is used.
    | The class defines the method conjugate(verb, subject) which is the main method of the module.

    :param language: string.
        Language of the conjugator. The default language is 'fr' for french.
    :param model: mlconjug.Model.
        A user provided model if the user has trained his own pipeline.
        Anything that is not a Model instance is rejected by set_model().
        When omitted, the pre-trained model bundled for the language is loaded.
    """

    def __init__(self, language='fr', model=None):
        self.language = language
        self.conjug_manager = ConjugManager(language=language)
        if not model:
            # ZipFile does not close a file object it did not open itself,
            # so the resource stream has to be closed explicitly.
            stream = pkg_resources.resource_stream(
                _RESOURCE_PACKAGE, _PRE_TRAINED_MODEL_PATH[language])
            try:
                with ZipFile(stream) as content:
                    with content.open('trained_model-{0}-final.pickle'.format(self.language), 'r') as archive:
                        # NOTE: unpickling can execute arbitrary code; the archive
                        # read here is the one shipped inside the package.
                        model = pickle.loads(archive.read())
            finally:
                stream.close()
        if model:
            self.set_model(model)
        else:
            self.model = model
        return

    def __repr__(self):
        return '{0}.{1}(language={2})'.format(__name__, self.__class__.__name__, self.language)

    def conjugate(self, verb, subject='abbrev'):
        """
        | This is the main method of this class.
        | It first checks to see if the verb is in Verbiste.
        | If it is not, and a pre-trained scikit-learn pipeline has been supplied, the method then calls the pipeline
            to predict the conjugation class of the provided verb.
        | Returns a Verb object or None.

        :param verb: string.
            Verb to conjugate.
        :param subject: string.
            Toggles abbreviated or full pronouns.
            The default value is 'abbrev'.
            Select 'pronoun' for full pronouns.
        :return: Verb object or None.
            None is returned when the verb is unknown and no model is set,
            or when no verb/conjugation information can be found.
        :raises ValueError: if the conjugation manager does not accept the word as a valid verb.
        """
        verb = verb.lower()
        prediction_score = 0
        if not self.conjug_manager.is_valid_verb(verb):
            raise ValueError(
                _('The supplied word: {0} is not a valid verb in {1}.').format(verb, _LANGUAGE_FULL[self.language]))
        if verb not in self.conjug_manager.verbs.keys():
            # Unknown verb: ask the model for its conjugation template.
            if self.model is None:
                return None
            prediction = self.model.predict([verb])[0]
            prediction_score = self.model.pipeline.predict_proba([verb])[0][prediction]
            predicted = True
            template = self.conjug_manager.templates[prediction]
            # A template has the form 'root:ending'; the root of the verb is
            # what remains once the ending is removed.
            ending_length = len(template) - template.index(":") - 1
            # verb[:-0] would be the empty string, so an empty ending must
            # keep the whole verb as its root.
            root = verb[:-ending_length] if ending_length else verb
            verb_info = VerbInfo(verb, root, template)
            conjug_info = self.conjug_manager.get_conjug_info(verb_info.template)
        else:
            predicted = False
            infinitive = verb
            verb_info = self.conjug_manager.get_verb_info(infinitive)
            if verb_info is None:
                return None
            conjug_info = self.conjug_manager.get_conjug_info(verb_info.template)
        if conjug_info is None:
            return None
        if predicted:
            verb_object = _VERBS[self.language](verb_info, conjug_info, subject, predicted)
            verb_object.predicted = predicted
            verb_object.confidence_score = round(prediction_score, 3)
        else:
            verb_object = _VERBS[self.language](verb_info, conjug_info, subject)
        return verb_object

    def set_model(self, model):
        """
        Assigns the provided pre-trained model to be able to conjugate unknown verbs.

        :param model: mlconjug.Model.
        :raises ValueError: if the supplied object is not an instance of Model.
        """
        if not isinstance(model, Model):
            # Carry the explanation in the exception instead of printing it.
            raise ValueError(_('Please provide an instance of a mlconjug.mlconjug.Model'))
        self.model = model
        return
class DataSet:
    """
    | This class holds and manages the data set.
    | Defines helper methods for managing Machine Learning tasks like constructing a training and testing set.

    :param verbs_dict:
        A dictionary of verbs and their corresponding conjugation class.
        Each value must expose its template under the 'template' key.
    """

    def __init__(self, verbs_dict):
        self.verbs_dict = verbs_dict
        self.verbs = self.verbs_dict.keys()
        # Sorted list of the distinct templates; a template's position in
        # this list is the integer label used for it.
        self.templates = sorted({verb['template'] for verb in self.verbs_dict.values()})
        self.verbs_list = []
        self.templates_list = []
        self.dict_conjug = {}
        self.train_input = []
        self.train_labels = []
        self.test_input = []
        self.test_labels = []
        self.construct_dict_conjug()
        return

    def __repr__(self):
        return '{0}.{1}()'.format(__name__, self.__class__.__name__)

    def construct_dict_conjug(self):
        """
        | Populates the dictionary containing the conjugation templates.
        | Populates the lists containing the verbs and their templates.
        | The verbs are visited in random order.
        | The lists are rebuilt from scratch, so calling the method again does not duplicate entries.
        """
        # One dictionary lookup per verb instead of a linear list.index() scan.
        template_index = {template: position for position, template in enumerate(self.templates)}
        conjug = defaultdict(list)
        verbs_list = []
        templates_list = []
        verb_items = list(self.verbs_dict.items())
        random.shuffle(verb_items)
        for verb, info_verb in verb_items:
            template = info_verb["template"]
            verbs_list.append(verb)
            templates_list.append(template_index[template])
            conjug[template].append(verb)
        self.verbs_list = verbs_list
        self.templates_list = templates_list
        self.dict_conjug = conjug
        return

    def split_data(self, threshold=8, proportion=0.5):
        """
        Splits the data into a training and a testing set.

        :param threshold: int.
            Minimum size of conjugation class to be split.
            Classes with at most this many verbs go entirely to the training set.
        :param proportion: float.
            Proportion of samples in the training set.
            Must be greater than 0 and at most 1.
        :raises ValueError: if proportion is outside that range.
        """
        if proportion <= 0 or proportion > 1:
            raise ValueError(_('The split proportion must be between 0 and 1.'))
        self.min_threshold = threshold
        self.split_proportion = proportion
        template_index = {template: position for position, template in enumerate(self.templates)}
        train_set = []
        test_set = []
        for template, lverbs in self.dict_conjug.items():
            if len(lverbs) <= threshold:
                # Class too small to be split: keep every verb for training.
                cut = len(lverbs)
            else:
                cut = round(len(lverbs) * proportion)
            train_set.extend((verbe, template) for verbe in lverbs[:cut])
            test_set.extend((verbe, template) for verbe in lverbs[cut:])
        random.shuffle(train_set)
        random.shuffle(test_set)
        self.train_input = [verbe for verbe, _template in train_set]
        self.train_labels = [template_index[template] for _verbe, template in train_set]
        self.test_input = [verbe for verbe, _template in test_set]
        self.test_labels = [template_index[template] for _verbe, template in test_set]
        return
class Model(object):
    """
    | This class manages the scikit-learn pipeline.
    | The Pipeline includes a feature vectorizer, a feature selector and a classifier.
    | If any of the vectorizer, feature selector or classifier is not supplied at instance declaration,
        the __init__ method will provide good default values that get more than 92% prediction accuracy.

    :param vectorizer: scikit-learn Vectorizer.
    :param feature_selector: scikit-learn Classifier with a fit_transform() method
    :param classifier: scikit-learn Classifier with a predict() method
    :param language: language of the corpus of verbs to be analyzed.
    """

    def __init__(self, vectorizer=None, feature_selector=None, classifier=None, language=None):
        # The components are compared with None rather than truth-tested:
        # some estimators define __len__ (ensembles do, over their fitted
        # sub-estimators), so truth-testing an unfitted one can raise or
        # wrongly evaluate to False.
        if vectorizer is None:
            vectorizer = CountVectorizer(analyzer=partial(extract_verb_features, lang=language, ngram_range=(2, 7)),
                                         binary=True)
        if feature_selector is None:
            feature_selector = SelectFromModel(LinearSVC(penalty='l1', max_iter=12000, dual=False, verbose=2))
        if classifier is None:
            # NOTE(review): recent scikit-learn releases name this loss 'log_loss';
            # confirm against the scikit-learn version the project pins.
            classifier = SGDClassifier(loss='log', penalty='elasticnet', l1_ratio=0.15,
                                       max_iter=4000, alpha=1e-5, random_state=42, verbose=2)
        self.pipeline = Pipeline([('vectorizer', vectorizer),
                                  ('feature_selector', feature_selector),
                                  ('classifier', classifier)])
        self.language = language
        return

    def __repr__(self):
        # Shows the pipeline step names in alphabetical order.
        return '{0}.{1}({2}, {3}, {4})'.format(__name__, self.__class__.__name__, *sorted(self.pipeline.named_steps))

    def train(self, samples, labels):
        """
        Trains the pipeline on the supplied samples and labels.

        :param samples: list.
            List of verbs.
        :param labels: list.
            List of verb templates.
        """
        self.pipeline = self.pipeline.fit(samples, labels)
        return

    def predict(self, verbs):
        """
        Predicts the conjugation class of the provided list of verbs.

        :param verbs: list.
            List of verbs.
        :return: list.
            List of predicted conjugation groups.
        """
        prediction = self.pipeline.predict(verbs)
        return prediction
# Running this module directly as a script does nothing.
if __name__ == "__main__":
    pass