tests/incite/collections/test_df_collection_base.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

from datetime import datetime, timezone
from typing import TYPE_CHECKING

import pandas as pd
import pytest
from pandera import DataFrameSchema

from generalresearch.incite.collections import (
    DFCollection,
    DFCollectionType,
)
from test_utils.incite.conftest import mnt_filepath

if TYPE_CHECKING:
    from generalresearch.incite.base import GRLDatasets

# Every DFCollectionType member except DFCollectionType.TEST; fed to
# pytest.mark.parametrize by the test classes in this module.
df_collection_types = list(
    filter(lambda member: member is not DFCollectionType.TEST, DFCollectionType)
)


@pytest.mark.parametrize("df_coll_type", df_collection_types)
class TestDFCollectionBase:
    """Constructor-level checks for DFCollection.

    None of these tests are about the DFCollection with any specific
    data_type... that will be handled in other parameterized tests. Each
    test runs once per entry in ``df_collection_types``.
    """

    def test_init(self, mnt_filepath: "GRLDatasets", df_coll_type: DFCollectionType):
        """Check that data_type is mandatory and is stored on the instance.

        Args:
            mnt_filepath: fixture exposing ``data_src``, used as archive_path.
            df_coll_type: the DFCollectionType supplied by parametrization.
        """
        # Omitting data_type must be rejected with an explicit message.
        with pytest.raises(expected_exception=ValueError) as cm:
            DFCollection(archive_path=mnt_filepath.data_src)
        assert "Must explicitly provide a data_type" in str(cm.value)

        # Disabled check (not currently asserted):
        # with pytest.raises(expected_exception=ValueError) as cm:
        #     DFCollection(
        #         data_type=DFCollectionType.TEST, archive_path=mnt_filepath.data_src
        #     )
        # assert "Must provide a supported data_type" in str(cm.value)

        # Build with the parametrized type so each run covers its own
        # data_type instead of every run constructing the same collection.
        instance = DFCollection(
            data_type=df_coll_type, archive_path=mnt_filepath.data_src
        )
        assert instance.data_type == df_coll_type


@pytest.mark.parametrize("df_coll_type", df_collection_types)
class TestDFCollectionBaseProperties:
    """Checks on the items, progress and _schema attributes of DFCollection."""

    @pytest.mark.skip
    def test_df_collection_items(self, mnt_filepath: "GRLDatasets", df_coll_type):
        """A 1800-1900 range at a 100 day offset should yield 366 items."""
        range_start = datetime(year=1800, month=1, day=1, tzinfo=timezone.utc)
        range_end = datetime(year=1900, month=1, day=1, tzinfo=timezone.utc)
        collection = DFCollection(
            data_type=df_coll_type,
            start=range_start,
            finished=range_end,
            offset="100d",
            archive_path=mnt_filepath.archive_path(enum_type=df_coll_type),
        )

        # One item is expected per interval in the range.
        assert len(collection.interval_range) == len(collection.items)
        assert len(collection.items) == 366

    def test_df_collection_progress(self, mnt_filepath: "GRLDatasets", df_coll_type):
        """progress should be a DataFrame of shape (366, 6) for the same range."""
        range_start = datetime(year=1800, month=1, day=1, tzinfo=timezone.utc)
        range_end = datetime(year=1900, month=1, day=1, tzinfo=timezone.utc)
        collection = DFCollection(
            data_type=df_coll_type,
            start=range_start,
            finished=range_end,
            offset="100d",
            archive_path=mnt_filepath.archive_path(enum_type=df_coll_type),
        )

        # Progress returns a dataframe with a row each Item
        assert isinstance(collection.progress, pd.DataFrame)
        assert collection.progress.shape == (366, 6)

    def test_df_collection_schema(self, mnt_filepath: "GRLDatasets", df_coll_type):
        """WALL and SESSION collections should carry distinct DataFrameSchemas.

        NOTE(review): df_coll_type is not used here, so each parametrized run
        performs the same WALL-vs-SESSION comparison.
        """
        wall_collection = DFCollection(
            data_type=DFCollectionType.WALL, archive_path=mnt_filepath.data_src
        )
        session_collection = DFCollection(
            data_type=DFCollectionType.SESSION, archive_path=mnt_filepath.data_src
        )

        assert wall_collection._schema != session_collection._schema
        for collection in (wall_collection, session_collection):
            assert isinstance(collection._schema, DataFrameSchema)


class TestDFCollectionBaseMethods:
    """Method-level tests for DFCollection; every test here is marked skip."""

    @pytest.mark.skip
    def test_initial_load(self, mnt_filepath: "GRLDatasets", thl_web_rr):
        """Run initial_load over a five minute USER window and check archive flags."""
        window_start = datetime(
            year=2022, month=1, day=1, minute=0, tzinfo=timezone.utc
        )
        window_end = datetime(
            year=2022, month=1, day=1, minute=5, tzinfo=timezone.utc
        )
        users = DFCollection(
            pg_config=thl_web_rr,
            data_type=DFCollectionType.USER,
            start=window_start,
            finished=window_end,
            offset="2min",
            archive_path=mnt_filepath.data_src,
        )

        # Confirm that there are no archives available yet
        assert users.progress.has_archive.eq(False).all()

        users.initial_load()
        # NOTE(review): 47 is a hard-coded row count; presumably tied to the
        # data behind thl_web_rr -- confirm before un-skipping.
        assert len(users.ddf().index) == 47
        assert users.progress.should_archive.eq(True).all()

        # A few archives should have been made
        assert not users.progress.has_archive.eq(False).all()

    @pytest.mark.skip
    def test_fetch_force_rr_latest(self):
        """Placeholder: no assertions implemented yet."""

    @pytest.mark.skip
    def test_force_rr_latest(self):
        """Placeholder: no assertions implemented yet."""