Source code for featurehub.user.session

import os
import json
import gc
import pandas as pd
import requests

from sqlalchemy.orm.exc import NoResultFound, MultipleResultsFound

from featurehub.admin.sqlalchemy_main import ORMManager
from featurehub.admin.sqlalchemy_declarative import (
    Problem, Feature, User, Metric
)
from featurehub.modeling import Model
from featurehub.util import run_isolated, get_source, TRY_AGAIN_LATER
from featurehub.evaluation import EvaluatorClient

[docs]class Session(object): """Represents a user's session within FeatureHub. Includes commands for discovering, testing, and registering new features. """ def __init__(self, problem, database = "featurehub"): self.__database = database self.__orm = ORMManager(database) self.__username = None with self.__orm.session_scope() as session: try: problem = session.query(Problem)\ .filter(Problem.name == problem)\ .one() self.__problem_id = problem.id except NoResultFound: raise ValueError("Invalid problem name: {}".format(problem)) except MultipleResultsFound: raise ValueError("Unexpected issue talking to database. " + TRY_AGAIN_LATER) # "log in" to the system self._login() # initialize evaluation client self.__evaluation_client = EvaluatorClient(self.__problem_id, self.__username, self.__orm) @property def __dataset(self): return self.__evaluation_client.dataset @property def __entities_featurized(self): return self.__evaluation_client.entities_featurized @property def __target(self): return self.__evaluation_client.target @staticmethod def _eval_server_post(route, data): url = "http://{}:{}/services/eval-server/{}".format( os.environ.get("EVAL_CONTAINER_NAME"), os.environ.get("EVAL_CONTAINER_PORT"), route ) headers = { "Authorization" : "token {}".format( os.environ.get("JUPYTERHUB_API_TOKEN")), } return requests.post(url=url, data=data, headers=headers) def _login(self): name = os.environ.get("USER") if not name: raise ValueError("Missing environment variable 'USER'. FeatureHub" " session not initialized.") with self.__orm.session_scope() as session: try: user = session.query(User)\ .filter(User.name == name)\ .one() self.__username = user.name except NoResultFound: data = { "database" : self.__orm.database } response = Session._eval_server_post("create-user", data) if response.ok: self.__username = name else: raise ValueError("Couldn't log in to FeatureHub. " \ + TRY_AGAIN_LATER) except MultipleResultsFound as e: raise ValueError("Unexpected error logging in to FeatureHub. " \ + TRY_AGAIN_LATER)
[docs] def get_sample_dataset(self): """Loads sample of problem training dataset. Returns the dataset a dict mapping table names to pandas DataFrames. Returns ------- dataset : dict (str => pd.DataFrame) A dict mapping table names to pandas DataFrames. target : pd.DataFrame A DataFrame that holds a single column: the target variable (label). Examples -------- >>> dataset = commands.get_sample_dataset() >>> dataset["users"] # -> returns DataFrame >>> dataset["stores"] # -> returns DataFrame """ self.__evaluation_client._load_dataset() # Return a *copy* of the dataset, ensuring we have enough memory. gc.collect() dataset = { table_name : self.__dataset[table_name].copy() for table_name in self.__dataset } target = self.__target.copy() # pylint: disable=no-member return (dataset, target)
[docs] def get_entity_features(self): """Loads preprocessed entity-level features of problem training dataset. The entity-level features are the same length as the entity DataFrame and the target DataFrame. Returns ------- entity_features : pd.DataFrame or None """ self.__evaluation_client._load_dataset() if not pd.DataFrame(self.__entities_featurized).empty: entity_features = self.__entities_featurized.copy() else: entity_features = None return entity_features
[docs] def discover_features(self, code_fragment=None): """Print features written by other users. A code fragment can be used to filter search results. For each feature, prints feature id, feature description, metrics, and source code. Parameters ---------- code_fragment : string, default=None Source code fragment to filter for. """ self._print_some_features(code_fragment, User.name != self.__username)
[docs] def print_my_features(self, code_fragment=None): """Print features written by me. A code fragment can be used to filter search results. For each feature, prints feature id, feature description, metrics, and source code. Parameters ---------- code_fragment : string, default=None Source code fragment to filter for. """ self._print_some_features(code_fragment, User.name == self.__username)
def _print_some_features(self, code_fragment, predicate): """Driver function for discover_features and print_my_features.""" with self.__orm.session_scope() as session: query = self._filter_features(session, code_fragment) # Filter only users that are not me query = query.join(Feature.user).filter(predicate) features = query.all() if features: for feature in features: # Join with metrics table query = session.query(Metric.name, Metric.value)\ .filter(Metric.feature_id == feature.id) metrics = query.all() metric_list = [(metric.name, metric.value) for metric in metrics] self._print_one_feature(feature.description, feature.id, feature.code, metric_list) else: print("No features found.")
[docs] def evaluate(self, feature): """Evaluate feature on training dataset and return key performance metrics. Runs the feature in an isolated environment to extract the feature values. Validates the feature values. Then, builds a model on that one feature and computes key cross-validated metrics. Prints results and returns a dictionary with (metric => value) entries. If the feature is invalid, prints reason and returns empty dictionary. Parameters ---------- feature : function Feature to evaluate """ if self.__evaluation_client.check_if_registered(feature, verbose=True): return return self.__evaluation_client.evaluate(feature)
[docs] def submit(self, feature, description=""): """Submit feature to server for evaluation on test data. If successful, registers feature in feature database and returns key performance metrics. Runs the feature in an isolated environment to extract the feature values. Validates the feature values. Then, builds a model on that one feature, performs cross validation, and returns key performance metrics. Parameters ---------- feature : function Feature to evaluate description : str Feature description. If left empty, will prompt for user imput. """ if not description: description = self._prompt_description() self.__evaluation_client.submit(feature, description)
def _filter_features(self, session, code_fragment): """Return query that filters this problem and given code fragment. Return a query object that filters features written for the appropriate problem by code snippets. This query object can be added to by the caller. """ filter_ = ( Feature.problem_id == self.__problem_id, ) if code_fragment: filter_ = filter_ + ( Feature.code.contains(code_fragment), ) return session.query(Feature).filter(*filter_) def _prompt_description(self): """Prompt user for description of feature""" print("First, enter feature description. Your feature description " "should be clear, concise, and meaningful to non-data scientists." " (If your feature fails to register, this description will be " "discarded.)") description = input("Enter description: ") print("") return description @staticmethod def _print_one_feature(feature_description, feature_id, feature_code, metric_list): """Print one feature in user-readable format. Parameters ---------- feature_description : str feature_id : int feature_code : str metric_list : MetricList Examples -------- >>> Session._print_one_feature("Age", 1, "def age(dataset): pass\n", metric_list_) ------------------- Feature id: 1 Feature description: Age Feature code: def age(dataset): pass Feature metrics: Accuracy: 0.5 ROC AUC: 0.35 """ result = "------------------\n" + \ "Feature id: {}\n".format(feature_id) + \ "Feature description: {}\n".format(feature_description) result += "\n" + \ "Feature code:\n" indent = " " tmp = [] for line in feature_code.split("\n"): tmp.append(indent + line) result += "\n".join(tmp) result += "\n" + \ "Feature metrics:\n" for metric_name, metric_value in metric_list: result += " {}: {}\n".format(metric_name, metric_value) print(result)