Source code for pygenprop.assign

#!/usr/bin/env python

"""
Created by: Lee Bergstrand (2017)

Description: Functions for assigning genome properties.
"""
from pygenprop.evidence import Evidence
from pygenprop.functional_element import FunctionalElement
from pygenprop.genome_property import GenomeProperty
from pygenprop.step import Step


class AssignmentCache(object):
    """
    This class contains a representation of precomputed assignment results and InterPro member database matches.

    Attributes:
        property_assignments: Maps a genome property identifier to its cached assignment.
        step_assignments: Maps a genome property identifier to a dictionary of {step number: assignment}.
        interpro_member_database_identifiers: A set of InterPro member database identifiers, or None
                                              if none were supplied.
        sample_name: The name of the sample the assignments belong to.
    """

    def __init__(self, interpro_member_database_identifiers: list = None, sample_name=None):
        """
        Constructs a new assignment cache.

        :param interpro_member_database_identifiers: An iterable of InterPro member database identifiers
                                                     found in the sample. Duplicates are collapsed.
        :param sample_name: The name of the sample.
        """
        # Normalise every supplied collection (including an empty one) to a set so the attribute
        # has one consistent type; only a missing argument is stored as None.
        if interpro_member_database_identifiers is not None:
            interpro_member_database_identifiers = set(interpro_member_database_identifiers)

        self.property_assignments = {}
        self.step_assignments = {}
        self.interpro_member_database_identifiers = interpro_member_database_identifiers
        self.sample_name = sample_name

    def cache_property_assignment(self, genome_property_identifier: str, assignment: str):
        """
        Stores cached assignment results for a genome property.

        :param genome_property_identifier: The identifier of genome property.
        :param assignment: An assignment of YES, NO or PARTIAL for the given genome property.
        """
        self.property_assignments[genome_property_identifier] = assignment

    def get_property_assignment(self, genome_property_identifier):
        """
        Retrieves cached assignment results for a genome property.

        :param genome_property_identifier: The identifier of genome property.
        :return: An assignment of YES, NO or PARTIAL for the given genome property,
                 or None if the property has not been cached.
        """
        return self.property_assignments.get(genome_property_identifier)

    def cache_step_assignment(self, genome_property_identifier: str, step_number: int, assignment: str):
        """
        Stores cached assignment results for a genome property step.

        :param genome_property_identifier: The identifier of the genome property for which the step belongs.
        :param step_number: The steps number.
        :param assignment: An assignment of YES or NO for the given step.
        """
        # setdefault creates the per-property dictionary the first time a step is cached for it.
        self.step_assignments.setdefault(genome_property_identifier, {})[step_number] = assignment

    def get_step_assignment(self, genome_property_identifier: str, step_number: int):
        """
        Retrieves cached assignment results for a genome property step.

        :param genome_property_identifier: The identifier of the genome property for which the step belongs.
        :param step_number: The steps number.
        :return: An assignment of YES or NO for the given step, or None if nothing is cached for it.
        """
        property_step_assignments = self.step_assignments.get(genome_property_identifier)

        if not property_step_assignments:
            return None

        # A missing (or empty) cached value is reported as None.
        return property_step_assignments.get(step_number) or None

    def flush_property_from_cache(self, genome_property_identifier):
        """
        Remove a genome property from the cache using its identifier.

        Both the property assignment and any step assignments stored for the property are removed.
        Flushing an identifier that is not cached is a no-op.

        :param genome_property_identifier: The identifier of the property to remove from the cache.
        """
        self.property_assignments.pop(genome_property_identifier, None)
        self.step_assignments.pop(genome_property_identifier, None)

    @property
    def genome_property_identifiers(self):
        """
        Creates a list of identifiers belonging to the genome properties cached.

        :return: A list of genome property identifiers.
        """
        return list(self.property_assignments)
def assign_genome_property(assignment_cache: AssignmentCache, genome_property: GenomeProperty):
    """
    Recursively assigns a result to a genome property and its children.

    Every step of the property is assigned through assign_step(). When the property declares
    required steps, only their results (together with the property's threshold) decide the
    outcome; otherwise the results of all steps are used. The outcome is stored in the cache.

    :param assignment_cache: A cache containing step and property assignments and InterPro member database matches.
    :param genome_property: The genome property to assign the results to.
    :return: The assignment results for the genome property.
    """
    required_steps = genome_property.required_steps

    # All steps are evaluated, not just the required ones.
    results_by_step_number = {step.number: assign_step(assignment_cache, step)
                              for step in genome_property.steps}

    if required_steps:
        required_numbers = {step.number for step in required_steps}
        required_results = [result for number, result in results_by_step_number.items()
                            if number in required_numbers]
        property_assignment = calculate_property_assignment_from_required_steps(required_results,
                                                                                genome_property.threshold)
    else:
        property_assignment = calculate_property_assignment_from_all_steps(
            list(results_by_step_number.values()))

    assignment_cache.cache_property_assignment(genome_property.id, property_assignment)

    return property_assignment
def assign_step(assignment_cache: AssignmentCache, step: Step):
    """
    Assigns a result (YES, NO) to a step based on assignments of its functional elements.

    The functional element results are combined using the sufficient scheme and the step's
    result is cached under the identifier of the step's parent genome property.

    :param assignment_cache: A cache containing step and property assignments and InterPro member database matches.
    :param step: The current step element which needs assignment.
    :return: The assignment for the step.
    """
    element_results = [assign_functional_element(assignment_cache, functional_element)
                       for functional_element in step.functional_elements]

    result = calculate_step_or_functional_element_assignment(element_results, sufficient_scheme=True)

    assignment_cache.cache_step_assignment(step.parent.id, step.number, result)

    return result
def assign_functional_element(assignment_cache: AssignmentCache, functional_element: FunctionalElement):
    """
    Assigns a result (YES, NO) to a functional element based on assignments of its evidences.

    Every evidence is assigned. If at least one evidence is flagged as sufficient, only the
    sufficient evidences decide the outcome (using the sufficient scheme); otherwise all
    evidences are combined using the default, non-sufficient scheme.

    :param assignment_cache: A cache containing step and property assignments and InterPro member database matches.
    :param functional_element: The current functional_element which needs assignment.
    :return: The assignment for the functional element.
    """
    # Pair each evidence's assignment with its sufficient flag.
    assessed_evidences = [(assign_evidence(assignment_cache, evidence), evidence.sufficient)
                          for evidence in functional_element.evidence]

    results_of_sufficient = [result for result, is_sufficient in assessed_evidences if is_sufficient]

    if not results_of_sufficient:
        all_results = [result for result, _ in assessed_evidences]
        return calculate_step_or_functional_element_assignment(all_results)

    outcome = calculate_step_or_functional_element_assignment(results_of_sufficient, sufficient_scheme=True)

    return 'YES' if outcome == 'YES' else 'NO'
def assign_evidence(assignment_cache: AssignmentCache, current_evidence: Evidence):
    """
    Assigns a result to an evidence based on the presence or absence of InterPro member identifiers
    or on the assignment of the evidence's child genome property.

    For evidences backed by a genome property, only the first genome property attached to the
    evidence is consulted: its cached assignment is reused when available, otherwise the property
    is assigned recursively. For all other evidences the result is YES when any of the evidence's
    identifiers is among the cache's InterPro member database identifiers, and NO otherwise.

    :param assignment_cache: A cache containing step and property assignments and InterPro member database matches.
    :param current_evidence: The current evidence which needs assignment.
    :return: The assignment for the evidence.
    """
    if current_evidence.has_genome_property:
        child_property = current_evidence.genome_properties[0]
        cached_assignment = assignment_cache.get_property_assignment(child_property.id)

        if cached_assignment:
            return cached_assignment

        return assign_genome_property(assignment_cache, child_property)

    sample_identifiers = assignment_cache.interpro_member_database_identifiers

    # No identifiers recorded for the sample means nothing can match.
    if not sample_identifiers:
        return 'NO'

    if sample_identifiers.isdisjoint(set(current_evidence.evidence_identifiers)):
        return 'NO'

    return 'YES'
def calculate_property_assignment_from_required_steps(required_step_assignments: list, threshold: int = 0):
    """
    Takes the assignment results for each required step of a genome property and uses them to
    assign a result for the property itself. This is the classic algorithm used by EBI Genome Properties.

    From: https://genome-properties.readthedocs.io/en/latest/calculating.html

    To determine if the GP resolves to a YES (all required steps are present), NO (too few required steps
    are present) or PARTIAL (the number of required steps present is greater than the threshold, indicating
    that some evidence of the presence of the GP can be assumed).

    Child steps must be present ('YES') not partial.

    In Perl code for Genome Properties:

    Link: https://github.com/ebi-pf-team/genome-properties/blob/
    a76a5c0284f6c38cb8f43676618cf74f64634d33/code/modules/GenomeProperties.pm#L646

        #Three possible results for the evaluation
        if($found == 0 or $found <= $def->threshold){
            $def->result('NO'); #No required steps found
        }elsif($missing){
            $def->result('PARTIAL'); #One or more required steps found, but one or more required steps missing
        }else{
            $def->result('YES'); #All steps found.
        }

    If no required steps are found or the number found is less than or equal to the threshold --> No
    Else if any are missing --> PARTIAL
    ELSE (none are missing) --> YES

    So for problem space ALL_PRESENT > THRESHOLD > NONE_PRESENT:

    YES when ALL_PRESENT = CHILD_YES_COUNT
    PARTIAL when CHILD_YES_COUNT > THRESHOLD
    NO when CHILD_YES_COUNT <= THRESHOLD

    :param required_step_assignments: A list of assignment results for child steps or genome properties.
    :param threshold: The threshold of 'YES' assignments necessary for a 'PARTIAL' assignment.
    :return: The parent's assignment result.
    """
    yes_count = required_step_assignments.count('YES')

    # Mirrors the "$found == 0" clause of the reference implementation: when nothing was found the
    # result is NO. Without this guard an empty list would be reported as YES because 0 == len([]).
    if yes_count == 0:
        return 'NO'

    if yes_count == len(required_step_assignments):
        return 'YES'

    if yes_count > threshold:
        return 'PARTIAL'

    return 'NO'
def calculate_property_assignment_from_all_steps(child_assignments: list):
    """
    Takes the assignment results from all child results and uses them to assign a result for the parent itself.
    This algorithm is used to assign results to a single step from child functional elements and for genome
    properties that have no required steps such as "category" type genome properties. This is a more generic
    version of the algorithm used in calculate_property_assignment_from_required_steps()

    If all child assignments are YES, parent is YES (this includes an empty list of children).
    If all child assignments are NO, parent is NO.
    Anything else in between, parent is PARTIAL.

    :param child_assignments: A list of assignment results for child steps or genome properties.
    :return: The parents assignment result.
    """
    # The YES check comes first, so an empty list resolves to YES.
    if all(assignment == 'YES' for assignment in child_assignments):
        return 'YES'

    if all(assignment == 'NO' for assignment in child_assignments):
        return 'NO'

    return 'PARTIAL'
def calculate_step_or_functional_element_assignment(child_assignments: list, sufficient_scheme=False):
    """
    Assigns a step result or functional element result based of the assignments of its children. In the case of
    steps, this would be functional element assignments. In the case of functional elements this would be
    evidences.

    For assignments from child genome properties YES or PARTIAL is considered YES.

    See: https://github.com/ebi-pf-team/genome-properties/blob/
    a76a5c0284f6c38cb8f43676618cf74f64634d33/code/modules/GenomeProperties.pm#L686

        if($evObj->gp){
            if(defined($self->get_defs->{ $evObj->gp })){
                # For properties a PARTIAL or YES result is considered success
                if( $self->get_defs->{ $evObj->gp }->result eq 'YES' or
                    $self->get_defs->{ $evObj->gp }->result eq 'PARTIAL' ){
                    $succeed++;
                }elsif($self->get_defs->{ $evObj->gp }->result eq 'UNTESTED'){
                    $step->evaluated(0);

    :param sufficient_scheme: If false, any child NOs mean NO. If true, any child YES/PARTIAL means YES
    :param child_assignments: A list containing strings of YES, NO or PARTIAL
    :return: The assignment as either YES or NO.
    """
    if sufficient_scheme:
        # A single child that is not NO is enough; an empty list therefore gives NO.
        succeeded = any(assignment != 'NO' for assignment in child_assignments)
    else:
        # Every child must be something other than NO; an empty list therefore gives YES.
        succeeded = 'NO' not in child_assignments

    return 'YES' if succeeded else 'NO'