# Source code for pore_c.model

import logging
from enum import Enum
from typing import Dict, List, NewType, Optional

import numpy as np
import pandas as pd
import pyarrow as pa
import pysam
from pydantic import BaseModel, confloat, conint, constr

from .config import (
    ALIGN_IDX_DTYPE,
    ALIGN_SCORE_DTYPE,
    FRAG_IDX_DTYPE,
    GENOMIC_COORD_DTYPE,
    GENOMIC_DISTANCE_DTYPE,
    HAPLOTYPE_IDX_DTYPE,
    MQ_DTYPE,
    PERCENTAGE_DTYPE,
    PHASE_SET_DTYPE,
    READ_COORD_DTYPE,
    READ_DISTANCE_DTYPE,
    READ_IDX_DTYPE,
    SHORT_RANGE_CUTOFF,
    STRAND_DTYPE,
)
from .utils import mean_qscore

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class _BaseModel(BaseModel):
    """Pydantic base model that can describe itself as a pandas/pyarrow table.

    The helpers read a ``dtype`` entry from each property of the model's JSON
    schema (subclasses in this module supply it through ``Config.fields``) and
    fall back to an ordered categorical for enum-valued properties.
    """

    @classmethod
    def pandas_dtype(cls, overrides=None):
        """Return a mapping of column name to pandas dtype for this model.

        Args:
            overrides: Optional mapping of column name to dtype. An entry here
                is used as-is instead of the dtype found in the schema.

        Returns:
            A dict with one entry per schema property, in schema order.
        """
        overrides = {} if overrides is None else overrides
        schema = cls.schema()
        res = {}
        for column, col_schema in schema["properties"].items():
            if column in overrides:
                res[column] = overrides[column]
                continue
            if "$ref" in col_schema:
                # Follow the reference to the shared definition (e.g. an enum).
                def_key = col_schema["$ref"].rsplit("/", 1)[-1]
                col_schema = schema["definitions"][def_key]
            if "dtype" in col_schema:
                dtype = col_schema["dtype"]
            else:
                # Without an explicit dtype the property must be an enum; its
                # members become the categories, in declaration order.
                assert "enum" in col_schema
                dtype = pd.CategoricalDtype(col_schema["enum"], ordered=True)
            res[column] = dtype
        return res

    @classmethod
    def pyarrow_schema(cls, overrides=None):
        """Return the pyarrow schema matching :meth:`pandas_dtype`.

        Args:
            overrides: Optional column -> dtype mapping, forwarded to
                :meth:`pandas_dtype`.

        Returns:
            The ``pyarrow.Schema`` inferred from an empty, correctly typed frame.
        """
        dtype = cls.pandas_dtype(overrides=overrides)
        # The DataFrame constructor only accepts a single dtype, so the
        # per-column mapping has to be applied with ``astype`` instead.
        df = pd.DataFrame(columns=list(dtype)).astype(dtype)
        return pa.Schema.from_pandas(df)

    def to_tuple(self):
        """Return the field values of this record as a tuple, in field order."""
        return tuple(value for _, value in self)

    @classmethod
    def to_dataframe(cls, data: List, overrides: Optional[Dict] = None):
        """Convert a list of records into a typed dataframe.

        Args:
            data: Records of this model; column names are taken from the first.
            overrides: Optional column -> dtype mapping, forwarded to
                :meth:`pandas_dtype`.

        Returns:
            A dataframe with one row per record, cast to the model dtypes. An
            empty ``data`` yields an empty dataframe with the model's columns.
        """
        dtype = cls.pandas_dtype(overrides=overrides)
        if not data:
            # No record to read column names from; use the schema columns.
            return pd.DataFrame(columns=list(dtype)).astype(dtype)
        columns = [item[0] for item in data[0]]
        rows = [record.to_tuple() for record in data]
        return pd.DataFrame(rows, columns=columns).astype(dtype)


class AlignmentType(str, Enum):
    """The kind of alignment a record represents.

    Members are also ``str`` instances, so they compare equal to their values.
    """

    unmapped = "unmapped"
    primary = "primary"
    secondary = "secondary"
    supplementary = "supplementary"


class FragmentRecord(_BaseModel):
    """Meta-data associated with a restriction fragment"""

    chrom: constr(min_length=1, strip_whitespace=True)
    start: conint(ge=0)
    end: conint(ge=0)
    fragment_id: conint(ge=1, strict=True)
    fragment_length: conint(ge=1, strict=True)

    class Config:
        """Per-field description and dtype metadata for the table schema."""

        use_enum_values = True
        # ``dtype`` is read back from the JSON schema by ``pandas_dtype``.
        fields = dict(
            chrom=dict(description="The chromosome/contig the fragment is derived from", dtype="category"),
            start=dict(
                description="The zero-based start position on the genome of the fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            end=dict(
                description="The zero-based end position on the genome of the fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            fragment_id=dict(description="Unique integer ID of the fragment, starts at 1", dtype=FRAG_IDX_DTYPE),
            fragment_length=dict(description="Length of the fragment", dtype=GENOMIC_COORD_DTYPE),
        )


class AlignmentRecord(_BaseModel):
    """An alignment derived from a BAM file"""

    read_idx: conint(ge=0, strict=True)
    align_idx: conint(ge=0, strict=True)
    align_type: AlignmentType
    chrom: constr(min_length=1, strip_whitespace=True)
    start: conint(ge=0)
    end: conint(ge=0)
    strand: STRAND_DTYPE
    read_name: constr(min_length=1, strip_whitespace=True)
    read_length: conint(ge=1)
    read_start: conint(ge=0)
    read_end: conint(ge=0)
    mapping_quality: conint(ge=0, le=255)
    align_score: conint(ge=0)
    align_base_qscore: conint(ge=0)
    phase_set: int = 0
    phase_qual: conint(ge=0) = 0
    haplotype: conint(ge=-1) = -1

    class Config:
        """Per-field description and dtype metadata for the table schema."""

        use_enum_values = True
        # ``dtype`` is read back from the JSON schema by ``pandas_dtype``.
        fields = dict(
            read_idx=dict(description="Unique integer ID of the read", dtype=READ_IDX_DTYPE),
            align_idx=dict(description="Unique integer ID of the aligned segment", dtype=ALIGN_IDX_DTYPE),
            align_type=dict(description="The type of alignment", dtype="category"),
            chrom=dict(description="The chromosome/contig the read is aligned to", dtype="category"),
            start=dict(
                description="The zero-based start position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE
            ),
            end=dict(description="The end position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE),
            strand=dict(description="The alignment strand", dtype="bool"),
            read_name=dict(description="The original read name", dtype="str"),
            read_length=dict(description="The length of the read in bases", dtype=READ_COORD_DTYPE),
            read_start=dict(description="The start coordinate on the read (0-based)", dtype=READ_COORD_DTYPE),
            read_end=dict(description="The end coordinate on the read (0-based)", dtype=READ_COORD_DTYPE),
            mapping_quality=dict(description="The mapping quality as calculated by the aligner", dtype=MQ_DTYPE),
            align_score=dict(description="The alignment score as calculated by the aligner", dtype=ALIGN_SCORE_DTYPE),
            align_base_qscore=dict(
                description="The mean read base score for the aligned segment (rounded to the nearest integer).",
                dtype=ALIGN_SCORE_DTYPE,
            ),
            phase_set=dict(
                description="The ID of the phase set, often this is the start position of the phase block",
                dtype=PHASE_SET_DTYPE,
            ),
            phase_qual=dict(description="The phred-scaled quality score of the haplotype assignment", dtype=MQ_DTYPE),
            haplotype=dict(
                description=(
                    "The id of the haplotype within this block, usually set to 1 or 2. "
                    "A value of -1 means that this alignment is unphased"
                ),
                dtype=HAPLOTYPE_IDX_DTYPE,
            ),
        )

    @classmethod
    def from_aligned_segment(cls, align: pysam.AlignedSegment) -> "AlignmentRecord":
        """Extract information from a pysam Aligned segment.

        The query name must have the form ``<read_name>:<read_idx>:<align_idx>``.
        It is split from the right so that a read name which itself contains
        ``:`` is kept intact.

        Args:
            align: The aligned segment to convert.

        Returns:
            A validated :class:`AlignmentRecord`. Unmapped segments get the
            placeholder chromosome ``"NULL"`` with zero coordinates and score.

        Raises:
            ValueError: If the query name has fewer than three ``:``-separated
                parts or the two trailing parts are not integers.
        """
        read_name, read_idx, align_idx = align.query_name.rsplit(":", 2)
        read_idx, align_idx = int(read_idx), int(align_idx)

        if align.is_unmapped:
            align_cat = "unmapped"
            chrom, start, end, align_score = "NULL", 0, 0, 0
            read_length = align.query_length
            quals = align.query_qualities
            # TODO: handle this more gracefully
            if quals is None:
                align_base_qscore = 0
            else:
                align_base_qscore = mean_qscore(np.array(quals))
        else:
            chrom, start, end = (align.reference_name, align.reference_start, align.reference_end)
            read_length = align.infer_read_length()
            align_score = align.get_tag("AS")
            align_base_qscore = mean_qscore(np.array(align.query_alignment_qualities))
            if align.is_secondary:
                align_cat = "secondary"
            elif align.is_supplementary:
                align_cat = "supplementary"
            else:
                align_cat = "primary"

        # Phasing tags are optional; absent tags leave the model defaults.
        optional = {}
        for key, tag in [("haplotype", "HP"), ("phase_set", "PS"), ("phase_qual", "PC")]:
            if align.has_tag(tag):
                optional[key] = int(align.get_tag(tag))
        return cls(
            read_idx=read_idx,
            align_idx=align_idx,
            align_type=align_cat,
            chrom=chrom,
            start=start,
            end=end,
            strand=not align.is_reverse,
            read_name=read_name,
            read_length=read_length,
            read_start=align.query_alignment_start,
            read_end=align.query_alignment_end,
            mapping_quality=align.mapq,
            align_score=align_score,
            align_base_qscore=np.rint(align_base_qscore),
            **optional,
        )

    @classmethod
    def to_dataframe(cls, aligns: List, chrom_order: Optional[List[str]] = None):
        """Convert a list of alignment records into a typed dataframe.

        Args:
            aligns: Records to convert; column names come from the first one.
            chrom_order: Optional list of chromosome names. When given (and
                non-empty) the ``chrom`` column becomes an ordered categorical
                with exactly these categories.

        Returns:
            A dataframe with one row per record, cast to the model dtypes. An
            empty ``aligns`` yields an empty dataframe with the model's columns.
        """
        if chrom_order:
            overrides = {"chrom": pd.CategoricalDtype(chrom_order, ordered=True)}
        else:
            overrides = {}
        dtype = cls.pandas_dtype(overrides=overrides)
        if not aligns:
            # No record to read column names from; use the schema columns.
            return pd.DataFrame(columns=list(dtype)).astype(dtype)
        columns = [item[0] for item in aligns[0]]
        df = pd.DataFrame([record.to_tuple() for record in aligns], columns=columns)
        return df.astype(dtype)

    @staticmethod
    def update_dataframe_with_haplotypes(align_df, haplotype_df):
        """Overlay haplotype assignments onto an alignment dataframe.

        Args:
            align_df: Alignment dataframe with ``read_idx`` and ``align_idx``
                columns.
            haplotype_df: Dataframe with ``#readname``, ``haplotype`` and
                ``phaseset`` columns. ``#readname`` must have the form
                ``<read_name>:<read_idx>:<align_idx>``.

        Returns:
            ``align_df`` unchanged when ``haplotype_df`` has no rows. Otherwise
            a dataframe with the original column order in which ``haplotype``,
            ``phase_set`` and ``phase_qual`` are reset to -1, 0 and 0 and then
            ``haplotype``/``phase_set`` are filled in for matching alignments.
        """
        if len(haplotype_df) == 0:
            logger.info("Aligment haplotypes dataframe is empty, haplotypes won't be added.")
            return align_df
        # Split from the right so read names containing ':' stay in one piece.
        name_parts = (
            haplotype_df["#readname"]
            .str.rsplit(":", n=2, expand=True)
            .rename({0: "read_name", 1: "read_idx", 2: "align_idx"}, axis=1)
        )
        haplotype_df = (
            haplotype_df.join(name_parts)
            .rename(columns={"phaseset": "phase_set"})
            .replace(
                dict(
                    haplotype={"none": -1, "H1": 1, "H2": 2},
                    phase_set={"none": 0},
                )
            )
            .astype(
                {
                    "read_idx": READ_IDX_DTYPE,
                    "align_idx": ALIGN_IDX_DTYPE,
                    "haplotype": HAPLOTYPE_IDX_DTYPE,
                    "phase_set": PHASE_SET_DTYPE,
                }
            )
            .set_index(["read_idx", "align_idx"])
        )
        col_order = list(align_df.columns)
        align_df = align_df.set_index(["read_idx", "align_idx"])
        # Start from the "unphased" defaults, then overwrite matching rows.
        align_df["haplotype"] = -1
        align_df["phase_set"] = 0
        align_df["phase_qual"] = 0
        align_df.update(haplotype_df[["haplotype", "phase_set"]], overwrite=True, errors="ignore")
        align_df = align_df.reset_index()[col_order].astype(
            {"haplotype": HAPLOTYPE_IDX_DTYPE, "phase_set": PHASE_SET_DTYPE}
        )
        return align_df


class AlignmentFilterReason(str, Enum):
    """Reason codes recorded for an alignment by the filtering step.

    ``null`` is the first member and the default on :class:`PoreCRecord`.
    Note that the passing member is spelled ``Pass`` (``pass`` is a keyword)
    while its value is ``"pass"``.
    """

    null = "null"
    Pass = "pass"
    unmapped = "unmapped"
    singleton = "singleton"
    low_mq = "low_mq"
    short_overlap = "short_overlap"
    overlap_on_read = "overlap_on_read"
    not_on_shortest_path = "not_on_shortest_path"


class PoreCRecord(AlignmentRecord):
    """An aligned segment from a BAM file with additional Pore-C related fields"""

    pass_filter: bool = True
    filter_reason: AlignmentFilterReason = AlignmentFilterReason.null
    fragment_id: conint(ge=0) = 0
    num_contained_fragments: conint(ge=0) = 0
    num_overlapping_fragments: conint(ge=0) = 0
    overlap_length: conint(ge=0) = 0
    fragment_start: conint(ge=0) = 0
    fragment_end: conint(ge=0) = 0
    perc_of_alignment: confloat(ge=0, le=100) = 0.0
    perc_of_fragment: confloat(ge=0, le=100) = 0.0
    is_contained: bool = False

    class Config:
        """Per-field description and dtype metadata for the table schema."""

        use_enum_values = True
        # ``dtype`` is read back from the JSON schema by ``pandas_dtype``.
        fields = dict(
            pass_filter=dict(description="Boolean flag, true if alignment passes all filters", dtype="bool"),
            filter_reason=dict(
                description="If an alignment fails the filter the reason will be listed here", dtype="category"
            ),
            fragment_id=dict(
                description="The UID of the restriction fragment assigned to this alignment", dtype=FRAG_IDX_DTYPE
            ),
            num_contained_fragments=dict(
                description="The number of restriction fragments completely contained within this alignment",
                dtype="uint32",
            ),
            num_overlapping_fragments=dict(
                description="The number of restriction fragments overlapping this alignment", dtype="uint32"
            ),
            overlap_length=dict(
                description="The length of the overlap between alignment and fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            fragment_start=dict(
                description="The start point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            fragment_end=dict(
                description="The end point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            perc_of_alignment=dict(
                description="The percentage of the aligned segment that overlaps the assigned fragment",
                dtype=PERCENTAGE_DTYPE,
            ),
            perc_of_fragment=dict(
                description="The percentage of the assigned restriction fragment that overlaps the aligned segment",
                dtype=PERCENTAGE_DTYPE,
            ),
            is_contained=dict(
                description="Boolean flag to inidicate if the alignment is fully contained with the fragment",
                dtype="bool",
            ),
        )

    @classmethod
    def init_dataframe(cls, align_df: "AlignmentRecordDf") -> "PoreCRecordDf":
        """Extend an alignment dataframe with the Pore-C columns at their defaults.

        Args:
            align_df: Dataframe of alignment records. Columns (or index levels)
                it already has are left untouched.

        Returns:
            A copy of ``align_df`` with one new column per model field that was
            missing, filled with the schema ``default`` (or, for a referenced
            enum, its first member) and cast to the model dtype.

        Raises:
            ValueError: If a missing field has neither a default nor an enum.
        """
        res = align_df.copy()
        schema = cls.schema()

        # Resolve "$ref" entries into a fresh dict rather than rewriting the
        # dict returned by ``cls.schema()``, which may be shared between calls.
        col_schema = {}
        for key, val in schema["properties"].items():
            if "$ref" in val:
                def_key = val["$ref"].rsplit("/", 1)[-1]
                val = schema["definitions"][def_key]
            col_schema[key] = val

        dtype = cls.pandas_dtype()
        existing = set(align_df.index.names) | set(align_df.columns)
        additional_fields = set(dtype.keys()) - existing
        num_rows = len(res)
        for column, cs in col_schema.items():
            if column not in additional_fields:
                continue
            if "default" in cs:
                default_value = cs["default"]
            elif "enum" in cs:
                default_value = cs["enum"][0]
            else:
                raise ValueError(cs)
            res[column] = pd.Series([default_value] * num_rows, index=res.index).astype(dtype[column])
        return res


class HaplotypePairType(str, Enum):
    """Relationship between the haplotypes of the two alignments in a contact.

    ``null`` is the default on :class:`PoreCContactRecord`; the other members
    are assigned by ``PoreCContactRecord.from_pore_c_align_pair``.
    """

    null = "null"
    trans = "trans"
    unphased = "unphased"
    semi_phased = "semi_phased"
    phased_sets_differ = "phased_sets_differ"
    phased_h_cis = "phased_h_cis"
    phased_h_trans = "phased_h_trans"


class PoreCContactRecord(_BaseModel):
    """A pairwise contact between two alignments that come from the same read."""

    read_name: constr(min_length=1, strip_whitespace=True)
    read_length: conint(ge=1)
    read_idx: conint(ge=0, strict=True)
    contact_is_direct: bool = False
    contact_is_cis: bool = False
    contact_read_distance: int = 0
    contact_genome_distance: int = 0
    contact_fragment_adjacent: bool = False
    contact_fragment_distance: conint(ge=0, strict=True)
    haplotype_pair_type: HaplotypePairType = HaplotypePairType.null
    align1_align_idx: conint(ge=0, strict=True)
    align1_chrom: constr(min_length=1, strip_whitespace=True)
    align1_start: conint(ge=0)
    align1_end: conint(ge=0)
    align1_strand: STRAND_DTYPE
    align1_mapping_quality: conint(ge=0, le=255)
    align1_align_score: conint(ge=0)
    align1_align_base_qscore: conint(ge=0)
    align1_phase_set: int = 0
    align1_phase_qual: int = 0
    align1_haplotype: conint(ge=-1) = -1
    align1_fragment_id: conint(ge=0) = 0
    align1_fragment_start: conint(ge=0) = 0
    align1_fragment_end: conint(ge=0) = 0
    align2_align_idx: conint(ge=0, strict=True)
    align2_chrom: constr(min_length=1, strip_whitespace=True)
    align2_start: conint(ge=0)
    align2_end: conint(ge=0)
    align2_strand: STRAND_DTYPE
    align2_mapping_quality: conint(ge=0, le=255)
    align2_align_score: conint(ge=0)
    align2_align_base_qscore: conint(ge=0)
    align2_phase_set: int = 0
    # Was a second ``align1_phase_qual`` declaration, which left the model
    # without the ``align2_phase_qual`` field that Config and the factory use.
    align2_phase_qual: int = 0
    align2_haplotype: conint(ge=-1) = -1
    align2_fragment_id: conint(ge=0) = 0
    align2_fragment_start: conint(ge=0) = 0
    align2_fragment_end: conint(ge=0) = 0

    class Config:
        """Per-field description and dtype metadata for the table schema."""

        use_enum_values = True
        # ``dtype`` is read back from the JSON schema by ``pandas_dtype``.
        # Every ``description`` must be a plain string: a trailing comma inside
        # the parentheses would silently turn it into a 1-tuple.
        fields = dict(
            read_name=dict(description="The original read name", dtype="str"),
            read_length=dict(description="The length of the read in bases", dtype=READ_COORD_DTYPE),
            read_idx=dict(description="Unique integer ID of the read", dtype=READ_IDX_DTYPE),
            contact_is_direct=dict(
                description="There are no intervening assigned restriction fragments on the read", dtype="bool"
            ),
            contact_is_cis=dict(description="Both alignments come from the same chromsome/contig", dtype="bool"),
            contact_read_distance=dict(
                description=(
                    "The distance between the end of the left alignment and the start of the right "
                    "alignment on the read"
                ),
                dtype=READ_DISTANCE_DTYPE,
            ),
            contact_genome_distance=dict(
                description=(
                    "The distance between the end of the left alignment and the start of the right alignment "
                    "(valid for cis contacts only)"
                ),
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            contact_fragment_adjacent=dict(
                description="A boolean to indicate if the contact is between the same or adjacent fragments",
                dtype="bool",
            ),
            contact_fragment_distance=dict(
                description=(
                    "The distance between the midpoints of the assigned fragments (valid for cis contacts only)"
                ),
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            haplotype_pair_type=dict(
                description=(
                    "A categorical variable describing the relationship between the haplotypes assigned to each of the "
                    "alignments in a contact"
                ),
                dtype="category",
            ),
            align1_align_idx=dict(description="Unique integer ID of the first aligned segment", dtype=ALIGN_IDX_DTYPE),
            align1_chrom=dict(description="The chromosome/contig of the first aligned segment", dtype="category"),
            align1_start=dict(
                description="The zero-based start position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE
            ),
            align1_end=dict(description="The end position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE),
            align1_strand=dict(description="The alignment strand", dtype="bool"),
            align1_mapping_quality=dict(description="The mapping quality as calculated by the aligner", dtype=MQ_DTYPE),
            align1_align_score=dict(
                description="The alignment score as calculated by the aligner", dtype=ALIGN_SCORE_DTYPE
            ),
            align1_align_base_qscore=dict(
                description="The mean read base score for the aligned segment (rounded to the nearest integer).",
                dtype=ALIGN_SCORE_DTYPE,
            ),
            align1_phase_set=dict(
                description="The ID of the phase set, often this is the start position of the phase block",
                dtype=PHASE_SET_DTYPE,
            ),
            align1_phase_qual=dict(
                description="The phred-scaled quality score of the haplotype assignment", dtype=MQ_DTYPE
            ),
            align1_haplotype=dict(
                description=(
                    "The id of the haplotype within this block, usually set to 1 or 2. "
                    "A value of -1 means that this alignment is unphased"
                ),
                dtype=HAPLOTYPE_IDX_DTYPE,
            ),
            align1_fragment_id=dict(
                description="The UID of the restriction fragment assigned to this alignment", dtype=FRAG_IDX_DTYPE
            ),
            align1_fragment_start=dict(
                description="The start point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            align1_fragment_end=dict(
                description="The end point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            align2_align_idx=dict(description="Unique integer ID of the second aligned segment", dtype=ALIGN_IDX_DTYPE),
            align2_chrom=dict(description="The chromosome/contig of the second aligned segment", dtype="category"),
            align2_start=dict(
                description="The zero-based start position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE
            ),
            align2_end=dict(description="The end position on the genome of the alignment", dtype=GENOMIC_COORD_DTYPE),
            align2_strand=dict(description="The alignment strand", dtype="bool"),
            align2_mapping_quality=dict(description="The mapping quality as calculated by the aligner", dtype=MQ_DTYPE),
            align2_align_score=dict(
                description="The alignment score as calculated by the aligner", dtype=ALIGN_SCORE_DTYPE
            ),
            align2_align_base_qscore=dict(
                description="The mean read base score for the aligned segment (rounded to the nearest integer).",
                dtype=ALIGN_SCORE_DTYPE,
            ),
            align2_phase_set=dict(
                description="The ID of the phase set, often this is the start position of the phase block",
                dtype=PHASE_SET_DTYPE,
            ),
            align2_phase_qual=dict(
                description="The phred-scaled quality score of the haplotype assignment", dtype=MQ_DTYPE
            ),
            align2_haplotype=dict(
                description=(
                    "The id of the haplotype within this block, usually set to 1 or 2. "
                    "A value of -1 means that this alignment is unphased"
                ),
                dtype=HAPLOTYPE_IDX_DTYPE,
            ),
            align2_fragment_id=dict(
                description="The UID of the restriction fragment assigned to this alignment", dtype=FRAG_IDX_DTYPE
            ),
            align2_fragment_start=dict(
                description="The start point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
            align2_fragment_end=dict(
                description="The end point on the genome of this restriction fragment", dtype=GENOMIC_COORD_DTYPE
            ),
        )

    @classmethod
    def from_pore_c_align_pair(cls, read_name: str, read_length: int, read_idx: int, align1, align2):
        """Build a contact record from two alignments of the same read.

        Args:
            read_name: Name of the read both alignments belong to.
            read_length: Length of that read.
            read_idx: Integer ID of that read.
            align1: First alignment, in the order given by the caller. Must
                expose the attributes read below (``read_end``, ``pos_on_read``,
                ``fragment_id``, ``fragment_midpoint``, ``chrom``, ...).
            align2: Second alignment, with the same attributes.

        Returns:
            A validated :class:`PoreCContactRecord` in which ``align1`` is the
            alignment with the smaller (or equal) ``fragment_id``.
        """
        # Read-based measures use the caller's order, so compute them before
        # the pair is re-ordered by fragment id.
        contact_read_distance = align2.read_start - align1.read_end
        contact_is_direct = align2.pos_on_read - align1.pos_on_read == 1
        if align1.fragment_id > align2.fragment_id:
            align1, align2 = align2, align1

        contact_is_cis = align1.chrom == align2.chrom
        if contact_is_cis:
            contact_genome_distance = align2.start - align1.end
            contact_fragment_distance = align2.fragment_midpoint - align1.fragment_midpoint
            contact_fragment_adjacent = abs(align2.fragment_id - align1.fragment_id) <= 1
        else:
            # Genomic distances are only defined within one chromosome.
            contact_genome_distance = 0
            contact_fragment_distance = 0
            contact_fragment_adjacent = False

        if not contact_is_cis:
            haplotype_pair_type = HaplotypePairType.trans
        elif align1.haplotype == -1 and align2.haplotype == -1:
            haplotype_pair_type = HaplotypePairType.unphased
        elif align1.haplotype == -1 or align2.haplotype == -1:
            haplotype_pair_type = HaplotypePairType.semi_phased
        elif align1.phase_set != align2.phase_set:
            haplotype_pair_type = HaplotypePairType.phased_sets_differ
        elif align1.haplotype == align2.haplotype:
            haplotype_pair_type = HaplotypePairType.phased_h_cis
        else:
            haplotype_pair_type = HaplotypePairType.phased_h_trans

        return cls(
            read_name=read_name,
            read_length=read_length,
            read_idx=read_idx,
            contact_is_direct=contact_is_direct,
            contact_is_cis=contact_is_cis,
            contact_read_distance=contact_read_distance,
            contact_genome_distance=contact_genome_distance,
            contact_fragment_adjacent=contact_fragment_adjacent,
            contact_fragment_distance=contact_fragment_distance,
            haplotype_pair_type=haplotype_pair_type,
            align1_align_idx=align1.align_idx,
            align1_chrom=align1.chrom,
            align1_start=align1.start,
            align1_end=align1.end,
            align1_strand=align1.strand,
            align1_mapping_quality=align1.mapping_quality,
            align1_align_score=align1.align_score,
            align1_align_base_qscore=align1.align_base_qscore,
            align1_phase_set=align1.phase_set,
            align1_phase_qual=align1.phase_qual,
            align1_haplotype=align1.haplotype,
            align1_fragment_id=align1.fragment_id,
            align1_fragment_start=align1.fragment_start,
            align1_fragment_end=align1.fragment_end,
            align2_align_idx=align2.align_idx,
            align2_chrom=align2.chrom,
            align2_start=align2.start,
            align2_end=align2.end,
            align2_strand=align2.strand,
            align2_mapping_quality=align2.mapping_quality,
            align2_align_score=align2.align_score,
            align2_align_base_qscore=align2.align_base_qscore,
            align2_phase_set=align2.phase_set,
            align2_phase_qual=align2.phase_qual,
            align2_haplotype=align2.haplotype,
            align2_fragment_id=align2.fragment_id,
            align2_fragment_start=align2.fragment_start,
            align2_fragment_end=align2.fragment_end,
        )


class PoreCConcatemerRecord(_BaseModel):
    """Per-read summary of contact counts and maximum contact distances."""

    read_name: constr(min_length=1, strip_whitespace=True)
    read_length: conint(ge=1)
    read_idx: conint(ge=0, strict=True)
    read_order: conint(ge=0, strict=True)
    num_fragments: conint(ge=0, strict=True)
    total_contacts: conint(ge=0, strict=True)
    total_cis_contacts: conint(ge=0, strict=True)
    total_trans_contacts: conint(ge=0, strict=True)
    total_short_range_cis_contacts: conint(ge=0, strict=True)
    total_long_range_cis_contacts: conint(ge=0, strict=True)
    direct_contacts: conint(ge=0, strict=True)
    direct_cis_contacts: conint(ge=0, strict=True)
    direct_trans_contacts: conint(ge=0, strict=True)
    direct_short_range_cis_contacts: conint(ge=0, strict=True)
    direct_long_range_cis_contacts: conint(ge=0, strict=True)
    indirect_contacts: conint(ge=0, strict=True)
    indirect_cis_contacts: conint(ge=0, strict=True)
    indirect_trans_contacts: conint(ge=0, strict=True)
    indirect_short_range_cis_contacts: conint(ge=0, strict=True)
    indirect_long_range_cis_contacts: conint(ge=0, strict=True)
    haplotype_phased_h_cis: conint(ge=0, strict=True)
    haplotype_phased_h_trans: conint(ge=0, strict=True)
    haplotype_phased_sets_differ: conint(ge=0, strict=True)
    haplotype_semi_phased: conint(ge=0, strict=True)
    haplotype_unphased: conint(ge=0, strict=True)
    max_indirect_contact_genome_distance: conint(ge=0, strict=True)
    max_direct_contact_genome_distance: conint(ge=0, strict=True)
    max_indirect_contact_fragment_distance: conint(ge=0, strict=True)
    max_direct_contact_fragment_distance: conint(ge=0, strict=True)

    class Config:
        """Per-field description and dtype metadata for the table schema."""

        use_enum_values = True
        # ``dtype`` is read back from the JSON schema by ``pandas_dtype``.
        # Every ``description`` must be a plain string: a trailing comma inside
        # the parentheses would silently turn it into a 1-tuple.
        fields = dict(
            read_name=dict(description="The original read name", dtype="str"),
            read_length=dict(description="The length of the read in bases", dtype=READ_COORD_DTYPE),
            read_idx=dict(description="Unique integer ID of the read", dtype=READ_IDX_DTYPE),
            read_order=dict(description="The number of monomers for this read", dtype="uint32"),
            total_contacts=dict(
                description="The total (direct + indirect) number of contacts for this read", dtype="uint32"
            ),
            direct_contacts=dict(
                description="The total number direct (adjacent on read) contacts for this read", dtype="uint32"
            ),
            indirect_contacts=dict(
                description="The total number indirect (non-adjacent on read) contacts for this read", dtype="uint32"
            ),
            total_cis_contacts=dict(
                description="The total number of cis-contacts (direct + indirect) for this read", dtype="uint32"
            ),
            total_trans_contacts=dict(
                description="The total number of trans-contacts (direct + indirect) for this read", dtype="uint32"
            ),
            direct_cis_contacts=dict(description="The number of direct cis-contacts for this read", dtype="uint32"),
            direct_trans_contacts=dict(description="The number of direct trans-contacts for this read", dtype="uint32"),
            indirect_cis_contacts=dict(description="The number of indirect cis-contacts for this read", dtype="uint32"),
            indirect_trans_contacts=dict(
                description="The number of indirect trans-contacts for this read", dtype="uint32"
            ),
            total_short_range_cis_contacts=dict(
                description=f"The total number of cis contacts < {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            total_long_range_cis_contacts=dict(
                description=f"The total number of cis contacts >= {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            direct_short_range_cis_contacts=dict(
                description=f"The number of direct cis contacts < {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            direct_long_range_cis_contacts=dict(
                description=f"The number of direct cis contacts >= {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            indirect_short_range_cis_contacts=dict(
                description=f"The number of indirect cis contacts < {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            indirect_long_range_cis_contacts=dict(
                description=f"The number of indirect cis contacts >= {SHORT_RANGE_CUTOFF} bases apart for this read",
                dtype="uint32",
            ),
            haplotype_unphased=dict(
                description="The number of cis contacts where both members of the pair are unphased", dtype="uint32"
            ),
            haplotype_semi_phased=dict(
                description="The number of cis contacts where one member of the pair is unphased", dtype="uint32"
            ),
            haplotype_phased_sets_differ=dict(
                description=(
                    "The number of cis contacts where both members of the pair are phased but the phase sets differ"
                ),
                dtype="uint32",
            ),
            haplotype_phased_h_trans=dict(
                description=(
                    "The number of cis contacts where both members of the pair are phased, are part of the same phase "
                    "group, but the haplotypes differ"
                ),
                dtype="uint32",
            ),
            haplotype_phased_h_cis=dict(
                description=(
                    "The number of cis contacts where both members of the pair are phased, are part of the same phase "
                    "group, and the haplotypes agree"
                ),
                dtype="uint32",
            ),
            max_direct_contact_fragment_distance=dict(
                description="The longest distance between fragment midpoints for all direct contacts",
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            max_indirect_contact_fragment_distance=dict(
                description="The longest distance between fragment midpoints for all indirect contacts",
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            max_direct_contact_genome_distance=dict(
                description="The longest distance between alignment endpoints for all direct contacts",
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            max_indirect_contact_genome_distance=dict(
                description="The longest distance between alignment endpoints for all indirect contacts",
                dtype=GENOMIC_DISTANCE_DTYPE,
            ),
            num_fragments=dict(
                description=("The number of unique restriction fragments  represented in the concatemer"),
                dtype="uint32",
            ),
        )


# Distinct static type for chromosome/contig names (a plain ``str`` at runtime).
Chrom = NewType("Chrom", str)

# Distinct static types for the dataframes built from each record model; at
# runtime every one of them is an ordinary ``pd.DataFrame``.
FragmentRecordDf = NewType("FragmentRecordDf", pd.DataFrame)
AlignmentRecordDf = NewType("AlignmentRecordDf", pd.DataFrame)
PoreCRecordDf = NewType("PoreCRecordDf", pd.DataFrame)
PoreCContactRecordDf = NewType("PoreCContactRecordDf", pd.DataFrame)
PoreCConcatemerRecordDf = NewType("PoreCConcatemerRecordDf", pd.DataFrame)
# Pore-C Tools

# Source code for pore_c.model