"""Module containing a class for encoding a Phylogenetic tree into a matrix.
See:
Leibold, m.A., E.P. Economo and P.R. Peres-Neto. 2010. Metacommunity
phylogenetics: separating the roles of environmental filters and
historical biogeography. Ecology letters 13: 1290-1299.
Todo:
Use method to get labels by matrix index when available from tree.
"""
from random import shuffle
import numpy as np
from lmpy import Matrix, PhyloTreeKeys, TreeWrapper
# .............................................................................
[docs]class EncodingException(Exception):
"""Exception indicating there was a problem with encoding."""
# .............................................................................
[docs]class TreeEncoder:
"""A class representing encoding of tree to match PAM."""
# ..............................
def __init__(self, tree, pam):
"""Base constructor for tree encoder.
Args:
tree (TreeWrapper): A tree object to encode.
pam (Matrix): A PAM matrix object.
"""
self.tree = tree
if isinstance(pam, Matrix):
self.pam = pam
else:
self.pam = Matrix(pam)
# ..............................
@classmethod
[docs] def from_file(cls, tree_file_name, pam_file_name):
"""Creates an instance of the PhyloEncoding class from tree and pam.
Args:
tree_file_name (str): The location of the tree.
pam_file_name (str): The location of the PAM.
Returns:
TreeEncoder: A new tree encoder instance.
"""
tree = TreeWrapper.from_filename(tree_file_name)
pam = Matrix.load(pam_file_name)
return cls(tree, pam)
# ..............................
[docs] def encode_phylogeny(self):
"""Encode the phylogenetic tree into a matrix.
Note:
P in the literature, a tip (row) by internal node (column) matrix
that needs to match the provided PAM.
Todo:
Consider providing options for correcting the tree / pam
Raises:
EncodingException: Raised if the PAM and tree do not match.
Returns:
Matrix: An encoding of the phylogenetic tree.
"""
if self.validate(): # Make sure that the PAM and tree match
if self.tree.has_branch_lengths():
p_mtx = self._build_p_matrix_with_branch_lengths()
else:
p_mtx = self._build_p_matrix_no_branch_lengths()
else:
raise EncodingException("PAM and Tree do not match, fix before encoding")
return p_mtx
# ..............................
[docs] def validate(self):
"""Validates the tree / PAM combination.
Returns:
bool: Boolean indicating if the tree / pam is valid.
"""
# check if tree is ultrametric
if (
not self.tree.has_branch_lengths() or self.tree.is_ultrametric()
) and self.tree.is_binary():
# Check that matrix indices in tree match PAM
# List of matrix indices (based on PAM column count)
pam_mtx_indices = range(self.pam.shape[1])
# All matrix indices in tree
tree_mtx_indices = [
int(mtx_id)
for _, mtx_id in self.tree.get_annotations(PhyloTreeKeys.MTX_IDX)
if mtx_id is not None
]
# Find the intersection between the two lists
intersection = set(pam_mtx_indices) & set(tree_mtx_indices)
# This checks that there are no duplicates in either of the indices
# lists and that they overlap completely
if len(intersection) == len(pam_mtx_indices) and len(
pam_mtx_indices
) == len(tree_mtx_indices):
# If everything checks out to here, return true for valid
return True
# If anything does not validate, return false
return False
# ..............................
[docs] def _build_p_branch_length_values(self, node):
"""Recurse through the tree to get P matrix values for node / tips.
Args:
node (dendropy.Node): The current clade.
Returns:
tuple: A tuple of branch length dictionary, sum of branch lengths, and
p-values dictionary.
"""
# Dictionary of branch length lists, bottom-up from tip to current node
bl_dict = {}
clade_bl = node.edge_length # To avoid lookups
# This is the sum of all branch lengths in the clade and will be used
# as the denominator for the p value for each tip to this node. See
# the literature for more information. We will initialize with the
# branch length of this clade because it will always be added
# whether we are at a tip or an internal node.
bl_sum = clade_bl
# Dictionary of dictionaries of p values, first key is node path id,
# sub-keys are tip matrix indices for that node
p_vals_dict = {}
if node.num_child_nodes() > 0: # Assume this is two since binary
clade_p_vals = {}
multipliers = [-1.0, 1.0] # One positive, one negative
shuffle(multipliers)
for child in node.child_nodes():
(
child_bl_dict,
child_bl_sum,
child_p_val_dict,
) = self._build_p_branch_length_values(child)
multiplier = multipliers.pop(0)
# Extend the p values dictionary
p_vals_dict.update(child_p_val_dict)
# Add this child's branch length sum to the clade's branch
# length sum
bl_sum += child_bl_sum
child_bl = child.edge_length
# We will add this value to the branch length list for all of
# the tips in this clade. It is the branch length of this
# clade divided by the number of tips in the clade.
add_val = 1.0 * child_bl / len(child_bl_dict)
# Process eac of the tips in the child_bl_dict
for k, child_bl_k in child_bl_dict.items():
# The formula for calculating the p value is:
# P(tip)(node) = (l1 + l2/2 + l3/3 + ... ln/n) / sum of
# branch lengths to node
# The value is arbitrarily set to be positive for one
# child and negative for the other
# The 'l' term is the length of a branch and it is
# divided by the number of tips that share that
# branch.
tip_bls = child_bl_k + [add_val]
# Add the p value to p_vals_dict
clade_p_vals[k] = multiplier * sum(tip_bls) / child_bl_sum
# Add to bl_dict with this branch length
bl_dict[k] = tip_bls
p_vals_dict[node.label] = clade_p_vals
else: # We are at a tip
bl_dict = {node.taxon.annotations.get_value(PhyloTreeKeys.MTX_IDX): []}
return bl_dict, bl_sum, p_vals_dict
# ..............................
[docs] def _build_p_matrix_no_branch_lengths(self):
"""Creates a P matrix when no branch lengths are present.
Note:
For this method, we assume that there is a total weight of -1 to
the left and +1 to the right for each node. As we go down
(towards the tips) of the tree, we divide the proportion of
each previously visited node by 2. We then recurse with this
new visited list down the tree. Once we reach a tip, we can
return that list of proportions because it will match for that
tip for each of its ancestors.
Example:
3
+-- 2
| +-- 1
| | +-- 0
| | | +-- A
| | | +-- B
| | |
| | +-- C
| |
| +-- D
|
+--4
+-- E
+-- F
Step 1: (Node 3) []
- recurse left with [(3,-1)]
- recurse right with [(3,1)]
Step 2: (Node 2) [(3,-1)]
- recurse left with [(3,-.5),(2,-1)]
- recurse right with [(3,-.5),(2,1)]
Step 3: (Node 1)[(3,-.5),(2,-1)]
- recurse left with [(3,-.25),(2,-.5),(1,-1)]
- recurse right with [3,-.25),(2,-.5),(1,1)]
Step 4: (Node 0)[(3,-.25),(2,-.5),(1,-1)]
- recurse left with [(3,-.125),(2,-.25),(1,-.5),(0,-1)]
- recurse right with [(3,-.125),(2,-.25),(1,-.5),(0,1)]
Step 5: (Tip A) - Return [(3,-.125),(2,-.25),(1,-.5),(0,-1)]
Step 6: (Tip B) - Return [(3,-.125),(2,-.25),(1,-.5),(0,1)]
Step 7: (Tip C) - Return [(3,-.25),(2,-.5),(1,1)]
Step 8: (Tip D) - Return [(3,-.5),(2,1)]
Step 9: (Node 4) [(3,1)]
- recurse left with [(3,.5),(4,-1)]
- recurse right with [(3,.5),(4,1)]
Step 10: (Tip E) - Return [(3,.5),(4,-1)]
Step 11: (Tip F) - Return [(3,.5),(4,1)]
Creates matrix:
0 1 2 3 4
A -1.0 -0.5 -0.25 -0.125 0.0
B 1.0 -0.5 -0.25 -0.125 0.0
C 0.0 1.0 -0.5 -0.25 0.0
D 0.0 0.0 1.0 -0.5 0.0
E 0.0 0.0 0.0 0.5 -1.0
F 0.0 0.0 0.0 0.5 1.0
See:
Page 1293 of the literature.
Returns:
Matrix: An encoded phylogeny matrix.
"""
# .......................
# Initialize the matrix
internal_node_labels = [n.label for n in self.tree.nodes() if not n.is_leaf()]
labels = self.pam.get_column_headers()
# We need a mapping of node path id to matrix column. I don't think
# order matters
node_col_idx = dict(zip(internal_node_labels, range(len(internal_node_labels))))
mtx = Matrix(
np.zeros((len(labels), len(internal_node_labels)), dtype=float),
headers={'0': labels, '1': internal_node_labels},
)
# Get the list of tip proportion lists
# Note: Which tip each of the lists belongs to doesn't really matter
# but recursion will start at the top and go to the bottom of the
# tree tips.
self.tree.seed_node._set_edge_length(0.0)
tip_props = self._build_p_matrix_tip_proportion_list(
self.tree.seed_node, visited=[]
)
# The matrix index of the tip in the PAM maps to the row index of P
for row_idx, tip_prop in tip_props:
for node_clade_id, val in tip_prop:
mtx[row_idx][node_col_idx[node_clade_id]] = val
return mtx
# ..............................
[docs] def _build_p_matrix_tip_proportion_list(self, node, visited=None):
"""Builds a list of tip proportions for the p matrix w/o branch lengths.
Args:
node (dendropy.Node): The current clade.
visited (:obj:`None` or :obj:`list` of :obj:`tuple`): A list of
(node path id, proportion) tuples.
Note:
Proportion for each visited node is divided by two as we go
towards the tips at each hop.
Returns:
list: A list of tip proportions.
"""
if visited is None:
visited = [] # pragma: no cover
tip_props = []
if node.num_child_nodes() > 0: # Assume this is two since binary
# First divide all existing visited by two
new_visited = [(idx, val / 2) for idx, val in visited]
# Recurse. Left is negative, right is positive
left_child, right_child = node.child_nodes()
tip_props.extend(
self._build_p_matrix_tip_proportion_list(
left_child, visited=new_visited + [(node.label, -1.0)]
)
)
tip_props.extend(
self._build_p_matrix_tip_proportion_list(
right_child, visited=new_visited + [(node.label, 1.0)]
)
)
else:
tip_props.append(
(node.taxon.annotations.get_value(PhyloTreeKeys.MTX_IDX), visited)
)
return tip_props
# ..............................
[docs] def _build_p_matrix_with_branch_lengths(self):
"""Creates a P matrix when branch lengths are present.
Note:
For this method, we assume that there is a total weight of -1 to
the left and +1 to the right for each node. As we go down
(towards the tips) of the tree, we divide the proportion of
each previously visited node by 2. We then recurse with this
new visited list down the tree. Once we reach a tip, we can
return that list of proportions because it will match for that
tip for each of its ancestors.
Example:
3
+-- 2 (0.4)
| +-- 1 (0.15)
| | +-- 0 (0.65)
| | | +-- A (0.2)
| | | +-- B (0.2)
| | |
| | +-- C (0.85)
| |
| +-- D (1.0)
|
+-- 4 (0.9)
+-- E (0.5)
+-- F (0.5)
Value for any cell (tip)(node) = (l1 + l2/2 + l3/3 + ... + ln/n) /
(Sum of branch lengths in daughter clade)
ln / n -> The length of branch n divided by the number of tips that
descend from that clade
P(A)(2) = (0.2 + 0.65/2 + 0.15/3) /
(0.2 + 0.2 + 0.65 + 0.85 + 0.15)
= (0.2 + 0.325 + 0.05) / (2.05)
= 0.575 / 2.05
= 0.280
P(D)(3) = (1.0 + 0.4/4) / (.2 + .2 + .65 + .85 + .15 + 1.0 + .4)
= 1.1 / 3.45
= 0.319
Creates matrix:
0 1 2 3 4
A -1.000 -0.500 -0.280 -0.196 0.000
B 1.000 -0.500 -0.280 -0.196 0.000
C 0.000 1.000 -0.439 -0.290 0.000
D 0.000 0.000 1.000 -0.319 0.000
E 0.000 0.000 0.000 0.500 -1.000
F 0.000 0.000 0.000 0.500 1.000
See:
Literature supplemental material.
Returns:
Matrix: An encoded phylogeny matrix.
"""
# We only need the P-values dictionary
# TODO: Is there a better way to do this so the length of the r?
self.tree.seed_node._set_edge_length(0.0)
_, _, p_val_dict = self._build_p_branch_length_values(self.tree.seed_node)
# Initialize the matrix
labels = self.pam.get_column_headers()
mtx = Matrix(
np.zeros((len(labels), len(p_val_dict)), dtype=float),
headers={'0': labels, '1': list(p_val_dict.keys())},
)
# We need a mapping of node path id to matrix column. I don't think
# order matters
node_col_idx = dict(zip(p_val_dict.keys(), range(len(p_val_dict.keys()))))
for node_clade_id, p_val_node_val in p_val_dict.items():
for tip_mtx_idx, tip_p_vals in p_val_node_val.items():
mtx[int(tip_mtx_idx)][node_col_idx[node_clade_id]] = tip_p_vals
return mtx