Create Simple Featurizers
If you do not want to use Toil or follow the CATH hierarchy, you can still create a Featurizer that calculates all features for a single protein, as shown in the following example. You can also call this function from your own workflow system (e.g. Nextflow) or from a simple loop:
from functools import partial
from typing import Optional, Union
import pandas as pd
from Prop3D.common.featurizer import ProteinFeaturizer
def calculate_features(pdb_file: str, name: str, level: Optional[str] = "atom",
                       work_dir: Optional[str] = None, input_format: str = "pdb",
                       update_features: Optional[list[str]] = None,
                       to_file: Optional[Union[bool, str]] = None) -> Union[str, pd.DataFrame]:
    """Calculate biophysical properties from a single protein

    Parameters
    ----------
    pdb_file : str
        Path to local structure file
    name : str
        Name of protein
    level : str
        'atom', 'residue', or 'edges' level
    work_dir : str
        Where to save all temporary files from all run software. If None, use cwd.
    input_format : str
        Input format, "pdb", "mmCIF", what ever Bio.PDB understands.
        NOTE(review): this value is accepted but not currently forwarded to
        ProteinFeaturizer -- confirm whether it should be.
    update_features : list
        List of features names or feature groups to update (while keeping the rest
        the same). Default is None, update all features
    to_file : bool or str
        If True, save the result to "<name>.h5"; if a string, save the result to
        that path. If None or False (default None), return the dataframe instead.

    Returns
    -------
    Either the path to the h5 file (when `to_file` is True or a path) or a pandas
    dataframe

    Raises
    ------
    ValueError
        If `level` is not one of 'atom', 'residue', 'edges'
    """
    valid_levels = ("atom", "residue", "edges")
    # Raise explicitly rather than assert: asserts are stripped under `python -O`.
    if level not in valid_levels:
        raise ValueError(f"level must be one of {valid_levels}, got {level!r}")

    # The third positional argument is passed as None
    # (presumably the workflow job handle -- confirm against ProteinFeaturizer).
    # With no explicit feature list, force calculation of every feature.
    structure = ProteinFeaturizer(
        pdb_file, name, None, work_dir,
        force_feature_calculation=update_features is None,
        update_features=update_features)

    # Dispatch table: one calculation routine per supported level.
    feature_calculator = {
        "atom": structure.calculate_flat_features,
        "residue": structure.calculate_flat_residue_features,
        "edges": partial(structure.calculate_graph, edgelist=True),
    }
    df, _ = feature_calculator[level](write=False)

    if level == "edges":
        # Flatten each node identifier into a string: drop its first element,
        # concatenate the remaining ones, and strip surrounding whitespace.
        for column in ("src", "dst"):
            df[column] = df[column].apply(lambda s: "".join(map(str, s[1:])).strip())
    else:
        # For atom/residue levels the returned table is replaced by the structure
        # dataframe with the calculated features attached.
        df = structure.get_pdb_dataframe(
            include_features=True, coarse_grained=level == "residue")

    if (isinstance(to_file, bool) and to_file) or isinstance(to_file, str):
        if isinstance(to_file, bool):
            to_file = f"{name}.h5"
        # DataFrame.to_hdf requires a key naming the group inside the HDF5 file;
        # the feature level is used as that key.
        df.to_hdf(to_file, key=level)
        return to_file

    return df