Source code for d3mdm.d3m

# -*- coding: utf-8 -*-

import logging

import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

LOGGER = logging.getLogger(__name__)


class BaseScraper(object):

    PATH = {
        'seed': 'seed_datasets_current/',
        'll0': 'training_datasets/LL0/',
        'll1': 'training_datasets/LL1/'
    }
    DATASET_TYPES = dict()
    STATUS_FORCELIST = (403, 404, 500, 502, 504)
    RETRIES = 10
    BACKOFF_FACTOR = 0.5

    def get_session(self):
        # Retry failed requests up to RETRIES times with exponential backoff,
        # also retrying on the HTTP status codes in STATUS_FORCELIST.
        session = requests.Session()
        retry = Retry(self.RETRIES, backoff_factor=self.BACKOFF_FACTOR,
                      status_forcelist=self.STATUS_FORCELIST)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        return session

    def __init__(self, skip_sublevels=False):
        self.session = self.get_session()
        self.sublevels = ['tables/'] if skip_sublevels else []

    def get_url(self, url, raw=False):
        LOGGER.info("Getting URL %s", url)
        r = self.session.get(url)
        if raw:
            return r.content
        else:
            return BeautifulSoup(r.text, 'html.parser')
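
    def get_links(self, soup):
        # ``get_links`` is called by the methods below but is missing from this
        # listing. A minimal sketch, assuming the pages are plain HTML directory
        # indexes: collect every anchor's href and drop the parent-directory
        # link. This body is an assumption, not the original implementation.
        return [
            a['href']
            for a in soup.find_all('a', href=True)
            if a['href'] != '../'
        ]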

    def get_datasets(self, dataset_type):
        url = self.BASE_URL + self.PATH[dataset_type]
        soup = self.get_url(url)
        links = self.get_links(soup)
        return [link for link in links if '.' not in link]

    def get_sublevel(self, base_path, level, sublevels):
        data = dict()
        if sublevels and level not in sublevels:
            return data

        level_url = base_path + level + '/'
        soup = self.get_url(level_url)
        links = self.get_links(soup)
        for link in links:
            if '.' not in link:
                # This is a link to a subfolder
                if link[-1] == '/':
                    link = link[:-1]

                data[link] = self.get_sublevel(level_url, link, sublevels)
            else:
                data[link] = self.get_url(level_url + link, raw=True)

        return data

    def get_base_url(self, dataset_name):
        # Build the dataset-name -> base-URL index lazily, on first lookup.
        if not self.DATASET_TYPES:
            for dataset_type, path in self.PATH.items():
                datasets = self.get_datasets(dataset_type)
                for dataset in datasets:
                    self.DATASET_TYPES[dataset] = self.BASE_URL + path

        return self.DATASET_TYPES[dataset_name]

    def scrape_dataset(self, dataset):
        # Returns a nested dict mirroring the remote folder layout: keys are
        # the '<dataset>_dataset' and '<dataset>_problem' root links, folders
        # map to nested dicts, and files map to their raw bytes.
        base_url = self.get_base_url(dataset) + dataset + '/'
        root_links = [
            link
            for link in self.get_links(self.get_url(base_url))
            if dataset + '_dataset' in link or dataset + '_problem' in link
        ]
        sublevels = self.sublevels.copy()
        if sublevels:
            sublevels.extend(root_links)

        return {
            link: self.get_sublevel(base_url, link, sublevels)
            for link in root_links
        }


class D3MScraper(BaseScraper):

    BASE_URL = 'https://datadrivendiscovery.org/data/'

    def get_session(self, username, password):
        session = requests.Session()
        retry = Retry(self.RETRIES, backoff_factor=self.BACKOFF_FACTOR)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # Log in to datadrivendiscovery.org so the session cookie grants
        # access to the protected /data tree.
        login_data = {
            'username': username,
            'password': password,
            'target': '/data'
        }
        session.post('https://datadrivendiscovery.org/login', data=login_data)

        return session

    def __init__(self, username, password, skip_sublevels=False):
        self.session = self.get_session(username, password)
        self.sublevels = ['tables/'] if skip_sublevels else []


class IPFSScraper(BaseScraper):

    BASE_URL = 'https://gateway.ipfs.io/ipfs/QmWsbzjogZTY3Laf8SErQ9azfuY7BWicBmQjP9SxwvtqTz/'


class BaseManager(object):

    def __init__(self, source='D3M', username=None, password=None, skip_sublevels=False):
        # Choose the scraper backend based on ``source``; the credentials are
        # only needed for the D3M site.
        if source == 'D3M':
            self.scraper = D3MScraper(username, password, skip_sublevels)
        else:
            self.scraper = IPFSScraper(skip_sublevels)

    def load(self, dataset_name, raw='to_be_ignored'):
        # ``raw`` is accepted for interface compatibility but ignored.
        return self.scraper.scrape_dataset(dataset_name)

    def datasets(self):
        datasets = []
        for dataset_type in self.scraper.PATH.keys():
            datasets.extend(self.scraper.get_datasets(dataset_type))

        return list(sorted(datasets))

    def exists(self, dataset_name):
        return dataset_name in self.datasets()


class D3MManager(BaseManager):

    def __init__(self, username, password, skip_sublevels=False):
        self.scraper = D3MScraper(username, password, skip_sublevels)


class IPFSManager(BaseManager):

    def __init__(self, skip_sublevels=False):
        self.scraper = IPFSScraper(skip_sublevels)
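

# Example usage (a sketch, not part of the module): IPFSManager needs no
# credentials, while D3MManager requires a datadrivendiscovery.org login.
# The dataset name '196_autoMpg' below is illustrative only.
#
#     manager = IPFSManager(skip_sublevels=True)
#     if manager.exists('196_autoMpg'):
#         contents = manager.load('196_autoMpg')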