Source code for xenonpy.datatools.preset

#  Copyright (c) 2021. yoshida-lab. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from itertools import zip_longest
from pathlib import Path
from warnings import warn

import numpy as np
import pandas as pd
from pymatgen.ext.matproj import MPRester
from ruamel.yaml import YAML
from tqdm import tqdm

from xenonpy._conf import __cfg_root__
from xenonpy.datatools.dataset import Dataset
from xenonpy.utils import config, get_dataset_url, get_sha256, Singleton

# Names exported by ``from xenonpy.datatools.preset import *``:
# the ``Preset`` class and the ready-made ``preset`` instance created at the end of this module.
__all__ = ['Preset', 'preset']


class Preset(Dataset, metaclass=Singleton):
    """
    Access the preset datasets of XenonPy and user-created data.

    Data files are searched in the ``dataset`` directory under the XenonPy
    configuration root, in the directory configured as ``userdata``, and in
    every directory listed in the ``ext_data`` configuration entry.
    Missing preset data can be downloaded over http with :meth:`sync`.

    The following sample demonstrates the basic usage.
    See the documentation of each method/property for details.

    ::

        >>> from xenonpy.datatools import preset
        >>> elements = preset.elements
        >>> elements.info()
        <class 'pandas.core.frame.DataFrame'>
        Index: 118 entries, H to Og
        Data columns (total 74 columns):
        atomic_number                    118 non-null int64
        atomic_radius                    88 non-null float64
        atomic_radius_rahm               96 non-null float64
        atomic_volume                    91 non-null float64
        atomic_weight                    118 non-null float64
        boiling_point                    96 non-null float64
        brinell_hardness                 59 non-null float64
        bulk_modulus                     69 non-null float64
        ...
    """

    # names of the preset datasets exposed as properties of this class
    __dataset__ = ('elements', 'elements_completed', 'atom_init')

    # keys accepted by :meth:`build`; reported in its error message
    __builder__ = ('mp_samples',)

    def __init__(self):
        self._dataset = Path(__cfg_root__) / 'dataset'
        self._ext_data = config('ext_data')
        super().__init__(str(self._dataset),
                         config('userdata'),
                         *self._ext_data,
                         backend='pandas',
                         prefix=('dataset',))
        yaml = YAML(typ='safe')
        yaml.indent(mapping=2, sequence=4, offset=2)
        self._yaml = yaml

    def sync(self, data, to=None):
        """
        Download a preset dataset and record its sha256 checksum.

        .. note::
            The dataset is always fetched from the remote repository and saved
            into the local ``dataset`` directory; the checksum of the downloaded
            file is then written to ``sha256.yml`` in the same directory.

        Args
        -----------
        data: str
            Name of the dataset (file name without the ``.pd.xz`` suffix).
        to: str
            The version of the repository to fetch from.
            If omitted, the default chosen by ``get_dataset_url`` is used.
            See Also: https://github.com/yoshida-lab/dataset/releases
        """
        dataset = self._dataset / (data + '.pd.xz')
        sha256_file = self._dataset / 'sha256.yml'

        # make sure the checksum file exists before loading it;
        # an empty file is loaded as ``None``
        sha256_file.touch()
        sha256 = self._yaml.load(sha256_file)
        if sha256 is None:
            sha256 = {}

        # fetch data from the remote repository
        url = get_dataset_url(data, to) if to else get_dataset_url(data)
        print('fetching dataset `{0}` from {1}.'.format(data, url))
        self.from_http(url, save_to=str(self._dataset))

        # remember the checksum of what was just downloaded
        sha256[data] = get_sha256(str(dataset))
        self._yaml.dump(sha256, sha256_file)

        self._make_index(prefix=['dataset'])

    def build(self, *keys, save_to=None, **kwargs):
        """
        Build a dataset from an external source and save it as a pickle file.

        Only the ``mp_samples`` key is supported. It queries the Materials Project
        database for a list of material ids and stores the result as a DataFrame.

        Args
        -----------
        keys: str
            Name(s) of the dataset to build. The first ``mp_samples`` key found is
            built and the method returns; other keys are ignored.
        save_to: str or path-like
            Where to save the built DataFrame. If not given, the file
            ``mp_samples.pd.xz`` under the ``userdata`` directory is used.
        kwargs:
            ``api_key`` (required): API key of the Materials Project database.
            ``mp_ids`` (optional): a list/tuple of material ids, or the path (str) of a
            text file containing the ids. If omitted, the ``mp_ids.txt`` file located
            next to this module is used.

        Raises
        ------
        RuntimeError
            If ``api_key`` is not given.
        ValueError
            If ``mp_ids`` is neither a str nor a list/tuple,
            or if ``keys`` contains no supported key.
        """

        def load_ids(path):
            # ``ndmin=1`` keeps the result iterable when the file holds a single id
            return [s.decode('utf-8') for s in np.loadtxt(path, 'S20', ndmin=1)]

        # build materials project dataset
        def mp_builder(api_key, mp_ids):
            # split requests into fixed-size groups
            # eg: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
            def grouper(iterable, n, fillvalue=None):
                """Collect data into fixed-length chunks or blocks"""
                # ``max(n, 1)`` guards against a zero chunk size for short id lists
                args = [iter(iterable)] * max(n, 1)
                return zip_longest(fillvalue=fillvalue, *args)

            # the following props will be fetched
            mp_props = [
                'band_gap', 'density', 'volume', 'material_id', 'pretty_formula', 'elements', 'efermi',
                'e_above_hull', 'formation_energy_per_atom', 'final_energy_per_atom', 'unit_cell_formula',
                'structure'
            ]

            entries = []
            # materialized as a list so that tqdm can show the total number of requests
            mpid_groups = list(grouper(mp_ids, len(mp_ids) // 10))

            with MPRester(api_key) as mpr:
                for group in tqdm(mpid_groups):
                    # drop the ``None`` padding added by ``zip_longest`` to the last group
                    mpid_list = list(filter(None, group))
                    chunk = mpr.query({"material_id": {"$in": mpid_list}}, mp_props)
                    entries.extend(chunk)

            df = pd.DataFrame(entries, index=[e['material_id'] for e in entries])
            df = df.drop('material_id', axis=1)
            df = df.rename(columns={'unit_cell_formula': 'composition'})
            df = df.reindex(columns=sorted(df.columns))

            return df

        for key in keys:
            # compare by value; ``is`` on a str literal only works by interning accident
            if key == 'mp_samples':
                if 'api_key' not in kwargs:
                    raise RuntimeError('api key of materials projects database is needed')

                if 'mp_ids' in kwargs:
                    ids = kwargs['mp_ids']
                    if isinstance(ids, (list, tuple)):
                        mp_ids = ids
                    elif isinstance(ids, str):
                        mp_ids = load_ids(ids)
                    else:
                        raise ValueError('parameter `mp_ids` can only be a str to specific the ids file path '
                                         'or a list-like object contain the ids')
                else:
                    ids = Path(__file__).absolute().parent / 'mp_ids.txt'
                    mp_ids = load_ids(str(ids))

                data = mp_builder(kwargs['api_key'], mp_ids)
                if not save_to:
                    save_to = Path(config('userdata')) / 'mp_samples.pd.xz'
                    save_to = save_to.expanduser().absolute()
                data.to_pickle(save_to)
                self._make_index(prefix=['dataset'])
                return

        raise ValueError('no available key(s) in %s, these can only be %s' % (keys, self.__builder__))

    def _check(self, data):
        """
        Verify that a preset dataset exists locally and matches the expected checksum.

        Raises ``RuntimeError`` if the data file is missing. If the checksum recorded in
        ``sha256.yml`` (computed and stored on first use) differs from the value returned
        by ``config(data)``, a ``RuntimeWarning`` is emitted.
        """
        dataset = self._dataset / (data + '.pd.xz')
        sha256_file = self._dataset / 'sha256.yml'

        # the data must have been downloaded beforehand
        if not dataset.exists():
            raise RuntimeError(
                "data {0} not exist, please run <preset.sync('{0}')> to download from the repository".format(data),
                'See also: https://xenonpy.readthedocs.io/en/latest/tutorials/1-dataset.html')

        # check sha256 value
        sha256_file.touch()  # make sure sha256_file file exist.
        sha256 = self._yaml.load(sha256_file)
        if sha256 is None:
            sha256 = {}

        if data not in sha256:
            # first use of this dataset: compute and cache its checksum
            sha256_ = get_sha256(str(dataset))
            sha256[data] = sha256_
            self._yaml.dump(sha256, sha256_file)
        else:
            sha256_ = sha256[data]

        if sha256_ != config(data):
            warn(
                "local version {0} is different from the latest version {1}. "
                "you can use <Preset.sync('{0}', to='{1}')> to fix it.".format(data, config('db_version')),
                RuntimeWarning)

    @property
    def elements(self):
        """
        Element properties from embed dataset.
        These properties are summarized from `mendeleev`_, `pymatgen`_, `CRC Handbook`_ and `magpie`_.

        See Also: :doc:`features`

        .. _mendeleev: https://mendeleev.readthedocs.io
        .. _pymatgen: http://pymatgen.org/
        .. _CRC Handbook: http://hbcponline.com/faces/contents/ContentsSearch.xhtml
        .. _magpie: https://bitbucket.org/wolverton/magpie

        Returns
        -------
        DataFrame:
            element properties in pd.DataFrame
        """
        self._check('elements')
        return self.dataset_elements

    @property
    def atom_init(self):
        """
        The initialization vector for each element.

        See Also: https://github.com/txie-93/cgcnn#usage
        """
        self._check('atom_init')
        return self.dataset_atom_init

    @property
    def elements_completed(self):
        """
        Completed element properties. [MICE]_ imputation used

        .. [MICE] `Int J Methods Psychiatr Res. 2011 Mar 1; 20(1): 40–49.`__
                    doi: `10.1002/mpr.329 <10.1002/mpr.329>`_

        .. __: https://www.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=pubmed&retmode=ref&cmd=prlinks&id=21499542

        See Also: :doc:`features`

        Returns
        -------
            imputed element properties in pd.DataFrame
        """
        self._check('elements_completed')
        return self.dataset_elements_completed
# Ready-to-use module-level instance, exported via ``__all__``.
# NOTE(review): Preset is declared with the ``Singleton`` metaclass, so repeated
# ``Preset()`` calls presumably return this same object — confirm in xenonpy.utils.
preset = Preset()