Create Custom Featurizers
Prop3D includes the ability to add your own software into the featurization pipeline. All you need is a Docker image (if using custom software), a Prop3D.custom_featurizers.CustomFeaturizer subclass to run the custom software and parse its results, and an updated YAML file with the group and feature definitions.
When creating your own Docker images, please try to follow the UCSC CGL Docker Lib Philosophy. However, if the Docker image is already built or the wrapper.sh endpoint does not work, just add an attribute to your CustomFeaturizer subclass specifying the entry point of the software inside the Docker image.
from typing import Union

import pandas as pd

from Prop3D.parsers.container import Container
from Prop3D.custom_featurizers import CustomFeaturizer, StructureType


class NewFeaturizer(CustomFeaturizer, Container):
    """Template for a featurizer that runs custom software from a Docker image."""

    # Only add ENTRYPOINT if the UCSC CGL wrapper convention didn't work or
    # you don't want to use it
    ENTRYPOINT = "/path/to/software"
    IMAGE = "docker://USER/image"  # Specify the path to the docker image
    PARAMETERS = [
        # parameter name (used as var name in python), parameter value type,
        # formatting options
        [("parameter_name", "type", ["--format_option", "{}"])],
    ]

    def calculate_prop3D(self, path: str, structure: StructureType) -> tuple[Union[pd.DataFrame, None], Union[pd.DataFrame, None]]:
        """Calculate the features for one structure.

        Args:
            path: Path to the structure file.
            structure: The structure to featurize.

        Returns:
            A tuple ``(atom_features, residue_features)``; either element
            may be None if the featurizer does not produce that level.
        """
        # ... code to calculate your features ...
        return atom_features, residue_features
Summary
Create a Docker image for the new software of interest and upload it to Docker cloud.
Subclass Prop3D.custom_featurizers.CustomFeaturizer and Prop3D.parsers.container.Container to run the Docker image and parse the results. This subclass must contain a calculate_prop3D method that returns a tuple of two pandas DataFrames (atom-level features, residue-level features), either of which may be None.
Modify the custom features YAML file (Prop3D/custom_featurizers/custom_features.yaml) by creating the new top-level group the features will belong to, and then adding each feature with the specified arguments, including the new Container you just subclassed as its parser. If the file already exists, make sure it is not hidden with a “.” in front of its name.
Example with BioPython (Half Sphere Exposure)
First, create a new CustomFeaturizer:
from typing import Union
import pandas as pd
from Bio.PDB import Structure, HSExposure
from Prop3D.custom_featurizers import CustomFeaturizer, StructureType
class HalfSphereExposure(CustomFeaturizer):
    """Residue-level half sphere exposure (HSE) features computed with BioPython."""

    def calculate_prop3D(self, path: str, structure: StructureType) -> tuple[Union[pd.DataFrame, None], Union[pd.DataFrame, None]]:
        """Calculate CA-based half sphere exposure for every residue.

        Args:
            path: Path to the structure file (not used by this featurizer).
            structure: The structure whose residues are scored.

        Returns:
            ``(None, residue_result)``. ``residue_result`` is indexed by
            ``res_id`` and has the columns ``hs_exposure`` and
            ``hs_exposure_norm`` (the z-score of ``hs_exposure``).
        """
        # Calculate HSEalpha. Bio.PDB.HSExposure is a module, so the calculator
        # class is looked up on it rather than calling the module itself.
        exp_ca = HSExposure.HSExposureCA(structure)

        # The property map is keyed by (chain_id, res_id); each value is a
        # (hse_up, hse_down, angle) tuple.
        # NOTE(review): HSE-up is taken as the exposure value -- confirm this
        # is the intended measure.
        residue_result = pd.DataFrame(
            [(res_id, exp_ca[(chain_id, res_id)][0])
             for chain_id, res_id in exp_ca.keys()],
            columns=["res_id", "hs_exposure"])
        residue_result = residue_result.set_index("res_id")

        # Standardize the raw exposure to zero mean and unit variance
        exposure = residue_result.hs_exposure
        residue_result = residue_result.assign(
            hs_exposure_norm=(exposure - exposure.mean()) / exposure.std())

        # No need to calculate is_exposed or is_buried, since they are defined
        # in the YAML file with 'from_feature', 'threshold', and 'equality'.
        # But you can if you want.
        return None, residue_result
# Feature group to append to Prop3D/custom_featurizers/custom_features.yaml
- hs_exposure: # Unique Group Name
    - name: "hs_exposure" # Unique Feature Name
      default: 0.
      aggregate: mean # How to combine for multiple values, e.g. for overlapping atoms in a voxel or if atom feature, how to combine to create a residue level feature
      residue: true # True if calculated at the residue level
      bool: false # True if value is Boolean
      # NOTE(review): confirm this range matches the values the parser returns
      min: -180
      max: 180
      parser: Prop3D.custom_featurizers.HalfSphereExposure
    - name: "hs_exposure_norm"
      default: 0.
      aggregate: mean
      residue: true
      bool: false
      # NOTE(review): the parser z-score normalizes this value, so it is not
      # confined to [0, 1] -- confirm min/max
      min: 0
      max: 1
      parser: Prop3D.custom_featurizers.HalfSphereExposure
    - name: 'is_exposed' # Create new Boolean value based on thresholding previous values
      default: 0
      aggregate: max
      residue: true
      bool: true
      min: 0.
      max: 1.
      threshold: 0.2
      from_feature: "hs_exposure_norm"
      equality: ">="
      parser: Prop3D.custom_featurizers.HalfSphereExposure
    - name: 'is_buried'
      default: 0
      aggregate: max
      residue: true
      bool: true
      min: 0.
      max: 1.
      threshold: 0.2
      from_feature: "hs_exposure_norm"
      equality: "<"
      parser: Prop3D.custom_featurizers.HalfSphereExposure
Example with custom software (Fpocket)
In this example we will use Fpocket to calculate druggability scores for every atom. A docker image already exists for Fpocket.
First, create a new CustomFeaturizer Container to run the fpocket docker image and parse the results:
import shutil
from typing import Union
from pathlib import Path
import pandas as pd
from Bio import PDB
from Prop3D.parsers.container import Container
from Prop3D.custom_featurizers import CustomFeaturizer, StructureType
class Druggability(CustomFeaturizer, Container):  # Subclass Container to have the ability to run docker containers without leaving python
    """Atom-level druggability scores calculated by running Fpocket in a container."""

    IMAGE = "docker://fpocket/fpocket"
    ENTRYPOINT = "fpocket"
    PARAMETERS = [
        "-d",  # Always output condensed form, easier to parse
        # parameter name (used as var name in python), parameter value type,
        # formatting options
        [("pdb_file", "str", ["-f", "{}"])],
    ]
    CONTAINER_FILE_PREFIX = "/WORKDIR"  # Set the working directory in the container

    def calculate_prop3D(self, path: str, structure: StructureType, clean: bool = True) -> tuple[Union[pd.DataFrame, None], Union[pd.DataFrame, None]]:
        """Run fpocket for input structure and save the druggability score of the pocket to each atom in the pocket

        Args:
            path: Path to the structure file handed to Fpocket.
            structure: The structure whose atoms receive a score.
            clean: If True, remove the Fpocket results directory afterwards.

        Returns:
            ``(atom_results, None)``. ``atom_results`` is indexed by atom
            serial number and has one column, ``druggability_score``, holding
            the highest drug score of any pocket containing the atom (0. for
            atoms in no pocket).

        Raises:
            RuntimeError: If the expected Fpocket results directory is missing.
        """
        # Call Fpocket
        output = next(self(pdb_file=path))

        # Results are expected in "<input file stem>_out" inside the work dir.
        # `path` is a str, so wrap it in Path before asking for the stem.
        results_dir = Path(self.work_dir) / f"{Path(path).stem}_out"
        if not results_dir.is_dir():
            raise RuntimeError("Error running Fpocket")

        # Pocket info, dataframe with cols: cav_id drug_score volume nb_asph
        # inter_chain apol_asph_proportion mean_asph_radius as_density
        # mean_asph_solv_acc mean_loc_hyd_dens flex hydrophobicity_score
        # volume_score charge_score polarity_score a0_apol a0_pol af_apol
        # af_pol n_abpa ala cys asp glu phe gly his ile lys leu met asn pro
        # gln arg ser thr val trp tyr chain_1_type chain_2_type
        # num_res_chain_1 num_res_chain_2 lig_het_tag name_chain_1 name_chain_2
        pocket_info = pd.read_csv(output, sep=" ")[["cav_id", "drug_score"]].set_index("cav_id")

        # NOTE(review): assumes Bio.PDB atoms, which expose the serial number
        # through get_serial_number() -- confirm for StructureType.
        atom_results = pd.DataFrame(
            0.,
            index=[a.get_serial_number() for a in structure.get_atoms()],
            columns=["druggability_score"])

        # Search recursively: Fpocket normally places the per-pocket files
        # (pocket<N>_atm.pdb) in a "pockets" sub-directory of the results dir.
        for pocket_file in results_dir.rglob("pocket*_atm.pdb"):
            # "pocket12_atm" -> 12; int so it matches the cav_id index read from the CSV
            cav_id = int(pocket_file.stem.split("_")[0].replace("pocket", ""))
            drug_score = pocket_info.drug_score[cav_id]
            with pocket_file.open() as f:
                for line in f:
                    if line.startswith("ATOM"):
                        # Columns 7-11 of a PDB ATOM record hold the serial number
                        serial_number = int(line[6:11])
                        # An atom can line several pockets; keep the best score
                        if atom_results.loc[serial_number].druggability_score < drug_score:
                            atom_results.loc[serial_number, "druggability_score"] = drug_score

        if clean:
            shutil.rmtree(results_dir)

        return atom_results, None
Next, we will append the new feature group along with its associated features to the custom features yaml:
- druggability: # Unique Group Name
    - name: "druggability_score" # Unique Feature Name
      default: 0.
      aggregate: mean # How to combine for multiple values, e.g. for overlapping atoms in a voxel or if atom feature, how to combine to create a residue level feature
      residue: false # True if calculated at the residue level
      bool: false # True if value is Boolean
      min: 0.
      max: 1.
      parser: Prop3D.custom_featurizers.Druggability
    - name: 'is_druggable' # Create new Boolean value based on thresholding previous values
      default: 0
      aggregate: max
      residue: true
      bool: true
      min: 0.
      max: 1.
      threshold: 0.5
      from_feature: "druggability_score"
      equality: ">="
      parser: Prop3D.custom_featurizers.Druggability
    - name: 'not_druggable'
      default: 0
      aggregate: max
      residue: true
      bool: true
      min: 0.
      max: 1.
      threshold: 0.5
      from_feature: "druggability_score"
      equality: "<"
      parser: Prop3D.custom_featurizers.Druggability