aboutsummaryrefslogtreecommitdiff
path: root/tests/incite/collections/test_df_collection_item_thl_web.py
diff options
context:
space:
mode:
authorMax Nanis2026-03-09 06:27:14 -0400
committerMax Nanis2026-03-09 06:27:14 -0400
commit2f92429a68ec7209059d2d18fe67964c8dd57cf2 (patch)
tree641358598982860f6452d27a74cae809b0d2d430 /tests/incite/collections/test_df_collection_item_thl_web.py
parentce291a165fab6b6dc9f053c7b75a699d0fdf389f (diff)
downloadgeneralresearch-2f92429a68ec7209059d2d18fe67964c8dd57cf2.tar.gz
generalresearch-2f92429a68ec7209059d2d18fe67964c8dd57cf2.zip
Simple typing changes, Ruff import formatter. p3
Diffstat (limited to 'tests/incite/collections/test_df_collection_item_thl_web.py')
-rw-r--r--tests/incite/collections/test_df_collection_item_thl_web.py241
1 files changed, 132 insertions, 109 deletions
diff --git a/tests/incite/collections/test_df_collection_item_thl_web.py b/tests/incite/collections/test_df_collection_item_thl_web.py
index 9c3d67a..29f3677 100644
--- a/tests/incite/collections/test_df_collection_item_thl_web.py
+++ b/tests/incite/collections/test_df_collection_item_thl_web.py
@@ -1,7 +1,8 @@
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timedelta, timezone
from itertools import product as iter_product
from os.path import join as pjoin
-from pathlib import PurePath, Path
+from pathlib import Path, PurePath
+from typing import TYPE_CHECKING, Callable
from uuid import uuid4
import dask.dataframe as dd
@@ -11,13 +12,13 @@ from distributed import Client, Scheduler, Worker
# noinspection PyUnresolvedReferences
from distributed.utils_test import (
- gen_cluster,
+ cleanup,
+ client,
client_no_amm,
+ cluster_fixture,
+ gen_cluster,
loop,
loop_in_thread,
- cleanup,
- cluster_fixture,
- client,
)
from faker import Faker
from pandera import DataFrameSchema
@@ -29,10 +30,14 @@ from generalresearch.incite.collections import (
DFCollectionType,
)
from generalresearch.incite.schemas import ARCHIVE_AFTER
+from generalresearch.models.thl.product import Product
from generalresearch.models.thl.user import User
from generalresearch.pg_helper import PostgresConfig
from generalresearch.sql_helper import PostgresDsn
-from test_utils.incite.conftest import mnt_filepath, incite_item_factory
+from test_utils.incite.conftest import incite_item_factory, mnt_filepath
+
+if TYPE_CHECKING:
+ from generalresearch.incite.base import GRLDatasets
fake = Faker()
@@ -71,7 +76,7 @@ class TestDFCollectionItemBase:
)
class TestDFCollectionItemProperties:
- def test_filename(self, df_collection_data_type, df_collection, offset):
+ def test_filename(self, df_collection_data_type, df_collection, offset: str):
for i in df_collection.items:
assert isinstance(i.filename, str)
@@ -88,35 +93,37 @@ class TestDFCollectionItemProperties:
)
class TestDFCollectionItemPropertiesBase:
- def test_name(self, df_collection_data_type, offset, df_collection):
+ def test_name(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.name, str)
- def test_finish(self, df_collection_data_type, offset, df_collection):
+ def test_finish(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.finish, datetime)
- def test_interval(self, df_collection_data_type, offset, df_collection):
+ def test_interval(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.interval, pd.Interval)
- def test_partial_filename(self, df_collection_data_type, offset, df_collection):
+ def test_partial_filename(
+ self, df_collection_data_type, offset: str, df_collection
+ ):
for i in df_collection.items:
assert isinstance(i.partial_filename, str)
- def test_empty_filename(self, df_collection_data_type, offset, df_collection):
+ def test_empty_filename(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.empty_filename, str)
- def test_path(self, df_collection_data_type, offset, df_collection):
+ def test_path(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.path, FilePath)
- def test_partial_path(self, df_collection_data_type, offset, df_collection):
+ def test_partial_path(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.partial_path, FilePath)
- def test_empty_path(self, df_collection_data_type, offset, df_collection):
+ def test_empty_path(self, df_collection_data_type, offset: str, df_collection):
for i in df_collection.items:
assert isinstance(i.empty_path, FilePath)
@@ -136,9 +143,9 @@ class TestDFCollectionItemMethod:
def test_has_mysql(
self,
df_collection,
- thl_web_rr,
- offset,
- duration,
+ thl_web_rr: PostgresConfig,
+ offset: str,
+ duration: timedelta,
df_collection_data_type,
delete_df_collection,
):
@@ -166,9 +173,9 @@ class TestDFCollectionItemMethod:
def test_update_partial_archive(
self,
df_collection,
- offset,
- duration,
- thl_web_rw,
+ offset: str,
+ duration: timedelta,
+ thl_web_rw: PostgresConfig,
df_collection_data_type,
delete_df_collection,
):
@@ -181,26 +188,26 @@ class TestDFCollectionItemMethod:
def test_create_partial_archive(
self,
df_collection,
- offset,
- duration,
+ offset: str,
+ duration: str,
create_main_accounts,
- thl_web_rw,
+ thl_web_rw: PostgresConfig,
thl_lm,
df_collection_data_type,
- user_factory,
- product,
+ user_factory: Callable[..., User],
+ product: Product,
client_no_amm,
incite_item_factory,
delete_df_collection,
- mnt_filepath,
+ mnt_filepath: "GRLDatasets",
):
assert 1 + 1 == 2
def test_dict(
self,
df_collection_data_type,
- offset,
- duration,
+ offset: str,
+ duration: timedelta,
df_collection,
delete_df_collection,
):
@@ -224,12 +231,12 @@ class TestDFCollectionItemMethod:
self,
df_collection_data_type,
df_collection,
- offset,
- duration,
+ offset: str,
+ duration: timedelta,
create_main_accounts,
- thl_web_rw,
- user_factory,
- product,
+ thl_web_rw: PostgresConfig,
+ user_factory: Callable[..., User],
+ product: Product,
incite_item_factory,
delete_df_collection,
):
@@ -270,10 +277,10 @@ class TestDFCollectionItemMethod:
self,
df_collection_data_type,
df_collection,
- offset,
- duration,
- user_factory,
- product,
+ offset: str,
+ duration: timedelta,
+ user_factory: Callable[..., User],
+ product: Product,
incite_item_factory,
delete_df_collection,
):
@@ -316,15 +323,15 @@ class TestDFCollectionItemMethod:
def test_from_mysql_ledger(
self,
df_collection,
- user,
+ user: User,
create_main_accounts,
- offset,
- duration,
- thl_web_rw,
+ offset: str,
+ duration: timedelta,
+ thl_web_rw: PostgresConfig,
thl_lm,
df_collection_data_type,
- user_factory,
- product,
+ user_factory: Callable[..., User],
+ product: Product,
client_no_amm,
incite_item_factory,
delete_df_collection,
@@ -371,12 +378,12 @@ class TestDFCollectionItemMethod:
def test_to_archive(
self,
df_collection,
- user,
- offset,
- duration,
+ user: User,
+ offset: str,
+ duration: timedelta,
df_collection_data_type,
- user_factory,
- product,
+ user_factory: Callable[..., User],
+ product: Product,
client_no_amm,
incite_item_factory,
delete_df_collection,
@@ -410,12 +417,12 @@ class TestDFCollectionItemMethod:
self,
df_collection_data_type,
df_collection,
- user_factory,
- product,
- offset,
- duration,
+ user_factory: Callable[..., User],
+ product: Product,
+ offset: str,
+ duration: timedelta,
client_no_amm,
- user,
+ user: User,
incite_item_factory,
delete_df_collection,
mnt_filepath,
@@ -481,19 +488,19 @@ class TestDFCollectionItemMethod:
@pytest.mark.skip
def test_to_archive_numbered_partial(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
def test_initial_load(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
def test_clear_corrupt_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@@ -505,28 +512,36 @@ class TestDFCollectionItemMethod:
class TestDFCollectionItemMethodBase:
@pytest.mark.skip
- def test_path_exists(self, df_collection_data_type, offset, duration):
+ def test_path_exists(
+ self, df_collection_data_type, offset: str, duration: timedelta
+ ):
pass
@pytest.mark.skip
- def test_next_numbered_path(self, df_collection_data_type, offset, duration):
+ def test_next_numbered_path(
+ self, df_collection_data_type, offset: str, duration: timedelta
+ ):
pass
@pytest.mark.skip
def test_search_highest_numbered_path(
- self, df_collection_data_type, offset, duration
+ self, df_collection_data_type, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
- def test_tmp_filename(self, df_collection_data_type, offset, duration):
+ def test_tmp_filename(
+ self, df_collection_data_type, offset: str, duration: timedelta
+ ):
pass
@pytest.mark.skip
- def test_tmp_path(self, df_collection_data_type, offset, duration):
+ def test_tmp_path(self, df_collection_data_type, offset: str, duration: timedelta):
pass
- def test_is_empty(self, df_collection_data_type, df_collection, offset, duration):
+ def test_is_empty(
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
+ ):
"""
test_has_empty was merged into this because item.has_empty is
an alias for is_empty.. or vis-versa
@@ -542,7 +557,7 @@ class TestDFCollectionItemMethodBase:
assert item.has_empty()
def test_has_partial_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
for item in df_collection.items:
assert not item.has_partial_archive()
@@ -550,7 +565,7 @@ class TestDFCollectionItemMethodBase:
assert item.has_partial_archive()
def test_has_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
for item in df_collection.items:
# (1) Originally, nothing exists... so let's just make a file and
@@ -587,7 +602,7 @@ class TestDFCollectionItemMethodBase:
assert item.has_archive(include_empty=True)
def test_delete_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
for item in df_collection.items:
item: DFCollectionItem
@@ -610,7 +625,7 @@ class TestDFCollectionItemMethodBase:
assert not item.partial_path.exists()
def test_should_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
schema: DataFrameSchema = df_collection._schema
aa = schema.metadata[ARCHIVE_AFTER]
@@ -627,11 +642,13 @@ class TestDFCollectionItemMethodBase:
assert not item.should_archive()
@pytest.mark.skip
- def test_set_empty(self, df_collection_data_type, df_collection, offset, duration):
+ def test_set_empty(
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
+ ):
pass
def test_valid_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
# Originally, nothing has been saved or anything.. so confirm it
# always comes back as None
@@ -655,17 +672,19 @@ class TestDFCollectionItemMethodBase:
@pytest.mark.skip
def test_validate_df(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
def test_from_archive(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
- def test__to_dict(self, df_collection_data_type, df_collection, offset, duration):
+ def test__to_dict(
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
+ ):
for item in df_collection.items:
res = item._to_dict()
@@ -683,19 +702,19 @@ class TestDFCollectionItemMethodBase:
@pytest.mark.skip
def test_delete_partial(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
def test_cleanup_partials(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@pytest.mark.skip
def test_delete_dangling_partials(
- self, df_collection_data_type, df_collection, offset, duration
+ self, df_collection_data_type, df_collection, offset: str, duration: timedelta
):
pass
@@ -715,7 +734,7 @@ async def test_client(client, s, worker):
)
@gen_cluster(client=True, nthreads=[("127.0.0.1", 1)])
@pytest.mark.anyio
-async def test_client_parametrize(c, s, w, df_collection_data_type, offset):
+async def test_client_parametrize(c, s, w, df_collection_data_type, offset: str):
"""c,s,a are all required - the secondary Worker (b) is not required"""
assert isinstance(c, Client), f"c is not Client, it's {type(c)}"
@@ -740,16 +759,16 @@ class TestDFCollectionItemFunctionalTest:
def test_to_archive_and_ddf(
self,
df_collection_data_type,
- offset,
- duration,
+ offset: str,
+ duration: timedelta,
client_no_amm,
df_collection,
- user,
- user_factory,
- product,
+ user: User,
+ user_factory: Callable[..., User],
+ product: Product,
incite_item_factory,
delete_df_collection,
- mnt_filepath,
+ mnt_filepath: "GRLDatasets",
):
from generalresearch.models.thl.user import User
@@ -790,16 +809,16 @@ class TestDFCollectionItemFunctionalTest:
def test_filesize_estimate(
self,
df_collection,
- user,
- offset,
- duration,
+ user: User,
+ offset: str,
+ duration: timedelta,
client_no_amm,
- user_factory,
- product,
+ user_factory: Callable[..., User],
+ product: Product,
df_collection_data_type,
incite_item_factory,
delete_df_collection,
- mnt_filepath,
+ mnt_filepath: "GRLDatasets",
):
"""A functional test to write some Parquet files for the
DFCollection and then confirm that the files get written
@@ -809,9 +828,11 @@ class TestDFCollectionItemFunctionalTest:
(1) Validating their passing the pandera schema
(2) The file or dir has an expected size on disk
"""
+ import os
+
import pyarrow.parquet as pq
+
from generalresearch.models.thl.user import User
- import os
if df_collection.data_type in unsupported_mock_types:
return
@@ -838,14 +859,14 @@ class TestDFCollectionItemFunctionalTest:
self,
client_no_amm,
df_collection,
- user_factory,
- product,
- offset,
- duration,
+ user_factory: Callable[..., User],
+ product: Product,
+ offset: str,
+ duration: timedelta,
df_collection_data_type,
incite_item_factory,
delete_df_collection,
- mnt_filepath,
+ mnt_filepath: "GRLDatasets",
):
from generalresearch.models.thl.user import User
@@ -875,7 +896,9 @@ class TestDFCollectionItemFunctionalTest:
assert item.has_archive(include_empty=True)
@pytest.mark.skip
- def test_get_items(self, df_collection, product, offset, duration):
+ def test_get_items(
+ self, df_collection, product: Product, offset: str, duration: timedelta
+ ):
with pytest.warns(expected_warning=ResourceWarning) as cm:
df_collection.get_items_last365()
assert "DFCollectionItem has missing archives" in str(
@@ -892,11 +915,11 @@ class TestDFCollectionItemFunctionalTest:
df_collection,
incite_item_factory,
delete_df_collection,
- user_factory,
- product,
- offset,
- duration,
- mnt_filepath,
+ user_factory: Callable[..., User],
+ product: Product,
+ offset: str,
+ duration: timedelta,
+ mnt_filepath: "GRLDatasets",
):
"""Don't allow creating an archive for data that will likely be
overwritten or updated
@@ -934,10 +957,10 @@ class TestDFCollectionItemFunctionalTest:
df_collection,
incite_item_factory,
delete_df_collection,
- user,
- offset,
- duration,
- mnt_filepath,
+ user: User,
+ offset: str,
+ duration: timedelta,
+ mnt_filepath: "GRLDatasets",
):
delete_df_collection(coll=df_collection)
@@ -962,10 +985,10 @@ class TestDFCollectionItemFunctionalTest:
df_collection,
incite_item_factory,
delete_df_collection,
- user_factory,
- product,
- offset,
- duration,
+ user_factory: Callable[..., User],
+ product: Product,
+ offset: str,
+ duration: timedelta,
mnt_filepath,
):
from generalresearch.models.thl.user import User