import sys
import os
import sklearn.metrics
import pandas as pd
import dill
import urllib.parse
import signal
import json
import traceback
import numpy as np
from contextlib import contextmanager
from featurehub.admin.sqlalchemy_declarative import *
FEATURE_EXTRACTION_TIME_LIMIT = 40
def load_features_df(session, problem_name):
    """Get all features for a specific problem as a DataFrame."""
    problem_id = session.query(Problem.id).filter(Problem.name ==
            problem_name).scalar()
    df = extract_table(session, Feature)
    return df.loc[df["problem_id"] == problem_id, :] 
def recover_function(feature):
    """Recover compiled function from Feature object."""
    f = dill.loads(urllib.parse.unquote_to_bytes(feature["feature_dill_quoted"]))
    return f 
def append_feature_functions(features_df, inplace=False):
    """Recover compiled functions and append column to DataFrame."""
    feature_functions = features_df.apply(recover_function, axis=1)
    if inplace:
        features_df["feature_function"] = feature_functions
        return None
    else:
        features_df = features_df.copy()
        features_df["feature_function"] = feature_functions
        return features_df 
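# A minimal usage sketch for the three helpers above, assuming an open
# SQLAlchemy `session` (e.g. from orm.session_scope()) and a hypothetical
# problem named "airbnb" registered in the database:
#
# >>> features_df = load_features_df(session, "airbnb")
# >>> features_df = append_feature_functions(features_df)
# >>> f = features_df["feature_function"].iloc[0]  # callable recovered by recover_function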
class TimeoutException(Exception):
    pass 
@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0) 
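# A minimal sketch of using time_limit, with a hypothetical `slow_call`
# standing in for a user feature function. The context manager is built on
# SIGALRM, so it only works on Unix and in the main thread:
#
# >>> try:
# ...     with time_limit(5):
# ...         result = slow_call()
# ... except TimeoutException:
# ...     result = None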
def build_feature_matrix(features_df, dataset, group_id, group_feature_indices,
        feature_extraction_time_limit=FEATURE_EXTRACTION_TIME_LIMIT):
    """Build feature matrix from human-generated features."""
    feature_functions = features_df["feature_function"]
    num_features = len(feature_functions)
    # extract feature values and names, giving a time limit on execution.
    features = []
    for index, (f, f_id) in enumerate(zip(feature_functions, group_feature_indices)):
        feature_name = "{}_{:04d}".format(group_id, f_id)
        frac = "{n}/{N}".format(n=index, N=num_features-1)
        print("Extracting feature {name:40.40} ({frac:>10.10})".format(
            name=feature_name, frac=frac), end='\r')
        try:
            with time_limit(feature_extraction_time_limit):
                feature = f(dataset)
        except TimeoutException as exc:
            print("Feature extraction (index {index}, name {name}) timed "
                  "out.".format(index=index, name=feature_name),
                  file=sys.stderr)
            # TODO needs entities table
            if features:
                feature = null_feature(features[0][0], name=feature_name)
            else:
                raise ValueError("Couldn't create null feature from empty"
                                 " features list.")
        except Exception as exc:
            print("Feature extraction (index {index}, name {name}) raised "
                  "Exception".format(index=index, name=feature_name),
                  file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            # TODO needs entities table
            if features:
                feature = null_feature(features[0][0], name=feature_name)
            else:
                raise ValueError("Couldn't create null feature from empty"
                                 " features list.")
        features.append((feature, feature_name,))
    print("\ndone")
    feature_matrix = pd.concat([pd.DataFrame(feature[0]) for feature in features], axis=1)
    feature_names = [feature[1] for feature in features]
    feature_matrix.columns = feature_names
    return feature_matrix 
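# A hedged sketch of calling build_feature_matrix directly. The `session`,
# `dataset`, and "airbnb" group id are assumptions; group_feature_indices
# supplies the numeric part of each "<group_id>_<index>" column name:
#
# >>> features_df = append_feature_functions(load_features_df(session, "airbnb"))
# >>> feature_matrix = build_feature_matrix(features_df, dataset, "airbnb",
# ...         list(features_df.index))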
def null_feature(entities, name='null_feature', fill=0.0):
    """Create null feature of an appropriate length."""
    index = entities.index
    df = pd.DataFrame(index=index)
    df[name] = fill
    return df 
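# Illustrative call (hypothetical values): a null feature copies the index of
# an already-extracted feature so it can take that feature's place in the
# feature matrix when extraction times out or fails:
#
# >>> placeholder = null_feature(features[0][0], name="airbnb_0003", fill=0.0)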
def save_feature_matrix(feature_matrix, problem_name, split, suffix):
    name = "output/features/{}_{}_{}.pkl.bz2".format(problem_name, split, suffix)
    save_table(feature_matrix, name)  
def load_feature_matrix(problem_name, split, suffix):
    name = "output/features/{}_{}_{}.pkl.bz2".format(problem_name, split, suffix)
    return load_table(name) 
def build_and_save_all_features(commands, session, suffix, splits=[],
        problem_names=[], features_on_disk=False):
    """Build and save feature matrices.
    Examples
    --------
    >>> with orm.session_scope() as session:
            build_and_save_all_features(commands, session, suffix)
    """
    # assumes problem names/orderings constant across extractions
    query = session.query(Problem).filter(Problem.name != "demo")
    if problem_names:
        query = query.filter(Problem.name.in_(problem_names))
    result = query.all()
    problem_names = [r.name for r in result]
    problem_ids   = [r.id for r in result]
    if not splits:
        splits = ["train", "test"]
    for problem_name, problem_id in zip(problem_names, problem_ids):
        for split in splits:
            print("Processing features for problem {}, split {}"
                    .format(problem_name, split))
            # load data
            _, dataset, entities_featurized, target = \
                commands.load_dataset(problem_name=problem_name, split=split)
            # extract features and indices
            if features_on_disk:
                tmp = load_table1("output/tables/features", suffix)
            else:
                tmp = extract_table(session, Feature)
            group_feature_indices = list(np.flatnonzero(tmp["problem_id"] == problem_id))
            features_df = tmp.loc[group_feature_indices, :]
    
            # compute feature functions
            append_feature_functions(features_df, inplace=True)
            feature_matrix = build_feature_matrix(features_df, dataset,
                    suffix, group_feature_indices)
            # save results
            save_feature_matrix(feature_matrix, problem_name, split, suffix) 
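# A hedged end-to-end sketch, assuming `commands` is the notebook commands
# object used elsewhere in FeatureHub and "airbnb" is a registered problem:
#
# >>> with orm.session_scope() as session:
# ...     build_and_save_all_features(commands, session, suffix="run1",
# ...             splits=["train"], problem_names=["airbnb"])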
def extract_and_save_all_tables(session, suffix):
    for mapper in [Feature, Problem, User, EvaluationAttempt, Metric]:
        df = extract_table(session, mapper)
        save_table1(df, mapper.__tablename__, suffix) 
def save_table1(df, name, suffix):
    underscore = "_" if suffix else ""
    name1 = name + underscore + suffix + ".pkl.bz2"
    save_table(df, name1) 
def save_table(df, name):
    fullname = os.path.join(os.path.expanduser("~"), "notebooks", name)
    df.to_pickle(fullname) 
def load_table1(name, suffix):
    underscore = "_" if suffix else ""
    name1 = name + underscore + suffix + ".pkl.bz2"
    return load_table(name1) 
def load_table(name):
    fullname = os.path.join(os.path.expanduser("~"), "notebooks", name)
    return pd.read_pickle(fullname) 
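# Naming convention used by save_table1/load_table1 (illustrative values):
# an optional "_<suffix>" plus ".pkl.bz2" is appended to the name, and
# save_table/load_table resolve everything relative to ~/notebooks.
#
# >>> save_table1(df, "output/tables/features", "run1")
# >>> df = load_table1("output/tables/features", "run1")
# Both map to ~/notebooks/output/tables/features_run1.pkl.bz2.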
def prepare_automl_file_name(problem_name, split, suffix):
    name = "automl_{}_{}_{}.pkl".format(problem_name, split, suffix)
    dirname = os.path.join(os.path.expanduser("~"), "notebooks", "output")
    if not os.path.exists(dirname):
        os.makedirs(dirname, exist_ok=True)
    return os.path.join(dirname, name) 
def load_dataset_from_dir(session, data_dir, problem_name):
    # query db for import parameters to load files
    problem = session.query(Problem)\
            .filter(Problem.name == problem_name).one()
    problem_files = json.loads(problem.files)
    problem_table_names = json.loads(problem.table_names)
    problem_entities_featurized_table_name = \
        problem.entities_featurized_table_name
    problem_target_table_name = problem.target_table_name
    problem_data_dir = os.path.join(data_dir, problem_name)
    # load raw data tables, skipping the pre-featurized entities and target tables
    dataset = {}
    for (table_name, filename) in zip(problem_table_names,
            problem_files):
        if table_name == problem_entities_featurized_table_name or \
           table_name == problem_target_table_name:
            continue
        abs_filename = os.path.join(problem_data_dir, filename)
        dataset[table_name] = pd.read_csv(abs_filename,
                low_memory=False, header=0)
    # if empty string, we simply don't have any features to add
    if problem_entities_featurized_table_name:
        cols = list(problem_table_names)
        ind_features = cols.index(problem_entities_featurized_table_name)
        abs_filename = os.path.join(problem_data_dir,
                problem_files[ind_features])
        entities_featurized = pd.read_csv(abs_filename,
                low_memory=False, header=0)
    else:
        # no pre-featurized entities table for this problem
        entities_featurized = None
    # load target
    cols = list(problem_table_names)
    ind_target = cols.index(problem_target_table_name)
    abs_filename = os.path.join(problem_data_dir,
            problem_files[ind_target])
    # target might not exist if we are making predictions on unseen
    # test data
    if os.path.exists(abs_filename):
        target = pd.read_csv(abs_filename, low_memory=False, header=0)
    else:
        target = None
    return dataset, entities_featurized, target 
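# A hedged usage sketch; the data directory and problem name are assumptions.
# The function returns the raw tables, the pre-featurized entities table
# (None if the problem has none), and the target (None when predicting on
# unseen test data):
#
# >>> dataset, entities_featurized, target = load_dataset_from_dir(
# ...         session, "/path/to/data", "airbnb")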
def save_submission(df, problem_name, split_train, split_test, suffix):
    underscore = "_" if suffix else ""
    name1 = os.path.join("output", "submissions",
            "submission_{}_{}_{}{}{}.csv".format(problem_name, split_train,
                split_test, underscore, suffix))
    fullname = os.path.join(os.path.expanduser("~"), "notebooks", name1)
    df.to_csv(fullname, index=True, header=True)
    print("Submission saved as {}".format(fullname))