Source code for smv.datasetrepo

#
# This file is licensed under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import inspect
import pkgutil
import sys
import traceback

import smv
from smv.error import SmvRuntimeError
from smv.utils import smv_copy_array, lazy_property

"""Python implementations of IDataSetRepoPy4J and IDataSetRepoFactoryPy4J interfaces
"""

class DataSetRepoFactory(object):
    """Factory that hands out `DataSetRepo` instances bound to one SmvApp.

    Python implementation of the IDataSetRepoFactoryPy4J interface.
    """

    def __init__(self, smvApp):
        """Remember the application object that every created repo will use.

        Args:
            smvApp: the application object passed on to each `DataSetRepo`.
        """
        self.smvApp = smvApp

    def createRepo(self):
        """Create a new `DataSetRepo` for the stored application object.

        Returns:
            DataSetRepo: a freshly constructed repo (a new one on every call).
        """
        return DataSetRepo(self.smvApp)
class DataSetRepo(object):
    """Discovers, imports and instantiates the datasets and providers of an app.

    Python implementation of the IDataSetRepoPy4J interface.
    """

    def __init__(self, smvApp):
        """Bind the repo to `smvApp`, dropping cached client modules if requested.

        Args:
            smvApp: application object. This class reads its
                `py_module_hotload` flag and calls its `stages()`,
                `userLibs()`, `semiLibs()` and `smvLibs()` methods.
        """
        self.smvApp = smvApp
        # When SmvApp is initialized with the py_module_hotload flag (the
        # default), remove client modules from sys.modules to force a reload
        # of all client code in the new transaction.
        if smvApp.py_module_hotload:
            self._clear_sys_modules()

    def _clear_sys_modules(self):
        """Remove all client modules from `sys.modules`.

        If modules have names like 'stage1.stage2.file.mod', then all of
            'stage1',
            'stage1.stage2',
            'stage1.stage2.file',
            'stage1.stage2.file.mod'
        have to be removed from the `sys.modules` dictionary to avoid getting
        cached modules from Python when we construct a new DataSetRepo.
        """
        # The set of all user-defined code that needs to be decached.
        user_code_fqns = set(self.smvApp.stages()).union(self.smvApp.userLibs())
        # Only the top-level package names matter ({'stage1'} in the example
        # above): everything underneath them must go as well.
        top_level_names = {fqn.split('.')[0] for fqn in user_code_fqns}

        # Iterate over a snapshot of the keys because sys.modules is mutated
        # while scanning.
        for loaded_mod_fqn in list(sys.modules):
            # A stub never contains a dot, so comparing the first component is
            # equivalent to "equals the stub or starts with stub + '.'".
            if loaded_mod_fqn.split('.', 1)[0] in top_level_names:
                sys.modules.pop(loaded_mod_fqn, None)

    def _for_name(self, name):
        """Look up an attribute of a discoverable module by its dotted name.

        Similar to Java's Class.forName, but only looks in configured stages.

        Args:
            name (str): fully qualified name, i.e. '<module fqn>.<attribute>'.

        Returns:
            The attribute named by the last component of `name`, or None when
            the module is not discoverable or has no such attribute.
        """
        # rpartition yields an empty module name when `name` has no dot, which
        # simply fails the lookup below instead of mis-slicing the string.
        file_name, _, ds_name = name.rpartition('.')

        # If the file isn't discoverable, the module doesn't exist.
        pymod = self.all_project_pymodules.get(file_name)
        if pymod is None:
            return None

        # Stay None if the file exists but has no attribute with that name.
        return getattr(pymod, ds_name, None)

    # Implementation of IDataSetRepoPy4J loadDataSet, which loads the dataset
    # from the most recent source. If the dataset does not exist, returns None.
    # However, if there is an error (such as a SyntaxError) which prevents the
    # user's file from being imported, the error will propagate back to the
    # DataSetRepoPython.
    def loadDataSet(self, fqn):
        """Instantiate the dataset class named by `fqn`.

        Args:
            fqn (str): fully qualified name of the dataset class.

        Returns:
            The object obtained by calling the class with `self.smvApp`, or
            None if no such class can be found.
        """
        ds_class = self._for_name(fqn)
        if ds_class is None:
            return None

        ds = ds_class(self.smvApp)

        # Python issue https://bugs.python.org/issue1218234
        # need to invalidate inspect.linecache to make dataset hash work
        srcfile = inspect.getsourcefile(ds_class)
        if srcfile:
            inspect.linecache.checkcache(srcfile)

        return ds

    def load_pymodule(self, fqn):
        """Import the module named `fqn` and return that (sub)module object.

        Args:
            fqn (str): dotted module name, e.g. 'pkg.sub.mod'.

        Returns:
            module: the module designated by the full dotted name.
        """
        # __import__('a.b.c') returns the top-level package 'a', so walk down
        # the remaining components to reach the requested module.
        mod = __import__(fqn)
        for subname in fqn.split('.')[1:]:
            mod = getattr(mod, subname)
        return mod

    @lazy_property
    def all_project_pymodules(self):
        """An index of discoverable Python modules by fqn.

        A Python module is discoverable if it is importable and belongs to a
        stage. We cache this information because walk_packages is slow and
        walking packages repeatedly while loading many datasets explodes the
        running time of operations like getting the graph of a project.

        Returns:
            dict: maps each module's `__name__` to the imported module object,
            for every non-package module found under the configured stages.
        """
        def packages_in_stage(stage_name):
            stage_pymod = self.load_pymodule(stage_name)

            # Where to recursively search for pymodules.
            search_path = stage_pymod.__path__

            # The prefix of all pymodules and packages found in this dir is
            # passed explicitly. This is a little strange - suppose we are
            # searching in stage foo.bar which has the following structure:
            #   |-foo
            #     |-bar
            #       |-buzz.py
            #       |-baz
            #         |-file.py
            # When walk_packages finds the directory `baz`, it won't know that
            # the package's name is `foo.bar.baz` - it's not aware that bar is
            # contained within another package. Unless we provide a prefix, it
            # will think that `foo.bar.baz`'s name is just `baz`. This sort of
            # makes sense, because if you added foo/bar to the path then you
            # could `import baz`. However, walk_packages will actually fail
            # because it cannot `import baz`, which it needs to do in order to
            # get package details that inform the recursive search. If there
            # are no packages (only pymodules) in foo/bar, then walk_packages
            # will succeed, but the output names will be wrong (e.g. `buzz`
            # instead of `foo.bar.buzz`).

            # `walk_packages` can generate AttributeError if the system has
            # Gtk modules, which are not designed to use with reflection or
            # introspection. Best action to take in this situation is probably
            # to simply suppress the error.
            def onerror(name):
                smv.logger.error("Skipping due to error during walk_packages: " + name)

            return pkgutil.walk_packages(
                path=search_path,
                prefix=stage_name + '.',
                onerror=onerror)

        # Build every walker up front so that all stage packages are imported
        # before any of their modules are loaded.
        stage_walkers = [packages_in_stage(stage) for stage in self.smvApp.stages()]

        modules_by_fqn = {}
        for _, name, is_pkg in itertools.chain.from_iterable(stage_walkers):
            if is_pkg:
                continue
            pymod = self.load_pymodule(name)
            modules_by_fqn[pymod.__name__] = pymod
        return modules_by_fqn

    def _matchingClassesInPyModule(self, pymod, is_matching, skip_abs=True):
        """Find all matching classes declared in a given Python module.

        Args:
            pymod (module): module (or package) whose attributes are scanned.
            is_matching (callable): predicate called on each class found in
                the module; only classes for which it is truthy are kept.
            skip_abs (bool): when True (default), abstract classes are
                excluded from the result.

        Returns:
            list: the matching classes declared in `pymod` or in one of its
            submodules.
        """
        def is_abstract_method(attr):
            # abc labels methods as abstract via the attribute __isabstractmethod__
            return getattr(attr, "__isabstractmethod__", False)

        matching_classes = []
        pymod_name = pymod.__name__
        submodule_prefix = pymod_name + "."

        # Iterate over the attributes of the module, looking for matches.
        for obj_name in dir(pymod):
            obj = getattr(pymod, obj_name)
            smv.logger.debug("Inspecting {} ({})".format(obj_name, type(obj)))

            # Skip non-class objects.
            if not inspect.isclass(obj):
                continue

            if not is_matching(obj):
                smv.logger.debug("Ignoring {} because it is not a match.".format(obj_name))
                continue

            # The class must be declared in this module or in one of its
            # submodules. Each package will contain all of the modules,
            # classes, etc. that were imported into it, and we need to exclude
            # these (so that we only count each module once). The comparison
            # is done on a '.' boundary so that a class from `stage.mod10` is
            # not mistaken for one declared in `stage.mod1`.
            declaring_module = obj.__module__
            obj_declared_in_pymod = (
                declaring_module == pymod_name
                or declaring_module.startswith(submodule_prefix)
            )
            if not obj_declared_in_pymod:
                smv.logger.debug("Ignoring {} because it was not "
                                 "declared in {}. (Note: it may "
                                 "be collected from another module)"
                                 .format(obj_name, pymod_name))
                continue

            # Class should not be an ABC (unless the caller asked for them).
            if skip_abs and inspect.isabstract(obj):
                abstract_methods = [name for name, _ in inspect.getmembers(obj, is_abstract_method)]
                smv.logger.debug("Ignoring {} because it is abstract ({} undefined)"
                                 .format(obj_name, ", ".join(abstract_methods)))
                continue

            smv.logger.debug("Collecting " + obj_name)
            matching_classes.append(obj)

        return matching_classes

    def dataSetsForStage(self, stageName):
        """List the fqns of all SmvGenericModules found in a stage.

        Args:
            stageName (str): name of the stage to search.

        Returns:
            list: the `fqn()` of every non-abstract class flagged with
            `IsSmvDataSet is True` in the discoverable modules of the stage.
        """
        def is_generic_module(klass):
            # We try to access the IsSmvDataSet attribute of the object.
            # If it does not exist, we will catch the AttributeError
            # and skip the object, as it is not an SmvGenericModules. We
            # specifically check that IsSmvDataSet is identical to
            # True, because some objects like Py4J's JavaObject override
            # __getattr__ to **always** return something (so IsSmvDataSet
            # may be truthy even though the object is not an SmvGenericModules).
            try:
                klass_is_smv_dataset = (klass.IsSmvDataSet is True)
            except AttributeError:
                klass_is_smv_dataset = False

            return klass_is_smv_dataset

        fqns = []

        smv.logger.debug("Searching for SmvGenericModules in stage " + stageName)
        smv.logger.debug("sys.path=" + repr(sys.path))

        # The additional "." is necessary to prevent false positives,
        # e.g. stage_2.M1 matching stage.
        stage_prefix = stageName + "."
        for pymod_name, pymod in self.all_project_pymodules.items():
            if pymod_name.startswith(stage_prefix):
                smv.logger.debug("Searching for SmvGenericModules in " + repr(pymod))
                gen_modules = self._matchingClassesInPyModule(pymod, is_generic_module)
                fqns.extend([obj.fqn() for obj in gen_modules])

        return fqns

    def _all_providers(self):
        """Scan user libraries and smv libraries for "provider" classes.

        Libraries that fail to import are skipped (the traceback is printed
        and the error is logged at debug level).

        Returns:
            dict: maps each provider's `provider_type_fqn()` to its class.

        Raises:
            SmvRuntimeError: if two providers report the same fqn.
        """
        def is_provider(klass):
            """A class is a provider if it has `IS_PROVIDER` and is not the
            base `SmvProvider`, which returns an empty string for its
            provider type.
            """
            try:
                klass_is_provider = (klass.IS_PROVIDER is True) and (klass.provider_type())
            except AttributeError:
                klass_is_provider = False

            return klass_is_provider

        # Providers can be in the user libs dir or builtin smv libs.
        prov_libs_names = self.smvApp.userLibs() + self.smvApp.semiLibs() + self.smvApp.smvLibs()
        prov_dict = {}

        for prov_lib_name in prov_libs_names:
            try:
                prov_lib = self.load_pymodule(prov_lib_name)
            except Exception as err:
                # Deliberately best-effort: ignore the library if there is any
                # loading error, but leave a trace of what went wrong.
                traceback.print_exc()
                message = "{0}({1!r})".format(type(err).__name__, err.args)
                smv.logger.debug("Ignoring {} because it has error: {}".format(prov_lib_name, message))
                continue

            # Abstract providers are collected too (skip_abs=False).
            providers = self._matchingClassesInPyModule(prov_lib, is_provider, skip_abs=False)
            for p in providers:
                p_fqn = p.provider_type_fqn()
                if p_fqn in prov_dict:
                    raise SmvRuntimeError("multiple providers with same fqn: " + p_fqn)
                prov_dict[p_fqn] = p

        return prov_dict