aboutsummaryrefslogtreecommitdiff
path: root/generalresearch/models/thl/profiling/upk_question.py
diff options
context:
space:
mode:
Diffstat (limited to 'generalresearch/models/thl/profiling/upk_question.py')
-rw-r--r--generalresearch/models/thl/profiling/upk_question.py683
1 files changed, 683 insertions, 0 deletions
diff --git a/generalresearch/models/thl/profiling/upk_question.py b/generalresearch/models/thl/profiling/upk_question.py
new file mode 100644
index 0000000..2b952ec
--- /dev/null
+++ b/generalresearch/models/thl/profiling/upk_question.py
@@ -0,0 +1,683 @@
+from __future__ import annotations
+
+import hashlib
+import json
+import re
+from enum import Enum
+from functools import cached_property
+from typing import List, Optional, Union, Literal, Dict, Tuple, Set
+
+from pydantic import (
+ BaseModel,
+ Field,
+ model_validator,
+ field_validator,
+ ConfigDict,
+ NonNegativeInt,
+ PositiveInt,
+)
+from typing_extensions import Annotated
+
+from generalresearch.models import Source
+from generalresearch.models.custom_types import UUIDStr
+from generalresearch.models.thl.category import Category
+
+
class UPKImportance(BaseModel):
    # Usage statistics describing how many live Tasks rely on a UPK item.
    # Every field is optional and defaults to None.

    task_count: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of live Tasks that use this UPK Question",
        examples=[47],
    )

    task_score: Optional[float] = Field(
        default=None,
        ge=0,
        description="GRL's internal ranked score for the UPK Question",
        examples=[0.11175522477414712],
    )

    # Same count as `task_count`, keyed by marketplace (`Source`).
    marketplace_task_count: Optional[Dict[Source, NonNegativeInt]] = Field(
        default=None,
        description=(
            "The number of live Tasks that use this UPK Question per marketplace"
        ),
        examples=[{Source.DYNATA: 23, Source.SPECTRUM: 24}],
    )
+
+
class PatternValidation(BaseModel):
    # One regex rule for a text-entry answer, paired with the message to show
    # when the answer does not match. Instances are immutable.
    model_config = ConfigDict(frozen=True)

    message: str = Field(description="Message to display if validation fails")

    pattern: str = Field(
        description=(
            "Regex string to validate. min_length and max_length are checked "
            "separately, even if they are part of the regex."
        )
    )
+
+
class UpkQuestionChoice(BaseModel):
    # A single selectable answer option belonging to a UpkQuestion.
    # `id` and `text` are frozen per-field; the remaining fields stay mutable.
    model_config = ConfigDict(populate_by_name=True, frozen=False)

    # The choice ID uses the marketplace's code. This needs to be >32 for pollfish
    id: str = Field(
        frozen=True,
        min_length=1,
        max_length=64,
        pattern=r"^[\w\s\.\-]+$",
        validation_alias="choice_id",
        serialization_alias="choice_id",
        description="The unique identifier for a response to a qualification",
    )

    text: str = Field(
        frozen=True,
        min_length=1,
        alias="choice_text",
        description="The response text shown to respondents",
    )

    # Display position of the choice within its question.
    order: NonNegativeInt = Field()

    # Allows you to group answer choices together (used for display or extra logic)
    group: Optional[int] = Field(default=None)

    exclusive: bool = Field(
        default=False,
        description="If answer is exclusive, it can be the only option selected",
    )

    importance: Optional[UPKImportance] = Field(default=None)

    def __hash__(self):
        # Hash on the choice id alone: the parent question's id is not known
        # here, so the hash is only meaningful among choices of one question.
        return hash(self.id)
+
+
class UpkQuestionChoiceOut(UpkQuestionChoice):
    # Output variant of UpkQuestionChoice; currently adds nothing to the base.
    # Possible future override, kept for reference:
    # importance: Optional[UPKImportance] = Field(default=None, exclude=True)
    pass
+
+
class UpkQuestionType(str, Enum):
    # Top-level kind of a UPK question. Each member is also a plain `str`.

    # Options are presented and the user must pick from them. The selector
    # (e.g. Selector.SINGLE_ANSWER vs Selector.MULTIPLE_ANSWER) says whether
    # exactly one or more than one option may be chosen.
    MULTIPLE_CHOICE = "MC"

    # No options are presented; the user types free text.
    TEXT_ENTRY = "TE"

    # A slider over possible values, typically a numerical range.
    SLIDER = "SLIDER"

    # The question has no UI elements.
    HIDDEN = "HIDDEN"
+
+
class UpkQuestionSelector(str, Enum):
    # Member-less base enum shared by the per-question-type selector enums.
    pass
+
+
class UpkQuestionSelectorMC(UpkQuestionSelector):
    # Selectors accepted for UpkQuestionType.MULTIPLE_CHOICE questions.
    SINGLE_ANSWER = "SA"
    MULTIPLE_ANSWER = "MA"
    DROPDOWN_LIST = "DL"
    SELECT_BOX = "SB"
    MULTI_SELECT_BOX = "MSB"
+
+
class UpkQuestionSelectorTE(UpkQuestionSelector):
    # Selectors accepted for UpkQuestionType.TEXT_ENTRY questions.
    SINGLE_LINE = "SL"
    MULTI_LINE = "ML"
    ESSAY_TEXT_BOX = "ETB"
+
+
class UpkQuestionSelectorSLIDER(UpkQuestionSelector):
    # Selectors for UpkQuestionType.SLIDER questions.
    HORIZONTAL_SLIDER = "HSLIDER"
    VERTICAL_SLIDER = "VSLIDER"
+
+
class UpkQuestionSelectorHIDDEN(UpkQuestionSelector):
    # The only selector accepted for UpkQuestionType.HIDDEN questions.
    HIDDEN = "HIDDEN"
+
+
class UpkQuestionConfigurationMC(BaseModel):
    # --- UpkQuestionType.MULTIPLE_CHOICE Options ---
    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Discriminator tag for the `Configuration` union; left out of dumps.
    type: Literal[UpkQuestionType.MULTIPLE_CHOICE] = Field(
        default=UpkQuestionType.MULTIPLE_CHOICE, exclude=True
    )

    # A multiple choice question with MA may allow a limited number of options
    # to be selected.
    # If the selector is SA, this should be set to 1. If the selector is MA,
    # then this must be <= len(choices).
    max_select: Optional[int] = Field(default=None, gt=0)
+
+
class UpkQuestionConfigurationTE(BaseModel):
    # --- UpkQuestionType.TEXT_ENTRY Options ---
    # Immutable; unknown keys are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Discriminator tag for the `Configuration` union; left out of dumps.
    type: Literal[UpkQuestionType.TEXT_ENTRY] = Field(
        exclude=True, default=UpkQuestionType.TEXT_ENTRY
    )

    # Sets input form attribute; not the same as regex validation
    max_length: Optional[PositiveInt] = Field(
        default=None,
        # The two literals used to join into "nonregex"; spell it "non-regex".
        description="Maximum str length of any input. Meant as an easy, "
        "non-regex based check.",
    )

    # The text input box must contain this number of chars before submission
    # is allowed
    min_length: Optional[PositiveInt] = Field(
        default=None,
        description="Minimum str length of any input. Meant as an easy, "
        "non-regex based check.",
    )

    @model_validator(mode="after")
    def check_options_agreement(self):
        """Reject a configuration whose min_length exceeds its max_length.

        The bounds are only compared when both are set. The failed assertion
        is raised from inside a pydantic validator.
        """
        if self.max_length is not None and self.min_length is not None:
            assert (
                self.min_length <= self.max_length
            ), "max_length must be >= min_length"
        return self
+
+
class UpkQuestionConfigurationSLIDER(BaseModel):
    # --- UpkQuestionType.SLIDER Options ---
    model_config = ConfigDict(frozen=True)

    # Discriminator tag for the `Configuration` union; left out of dumps.
    type: Literal[UpkQuestionType.SLIDER] = Field(
        default=UpkQuestionType.SLIDER, exclude=True
    )

    # TODO: constraints. we don't have any of these so not wasting time on this
    # (no cross-field checks are applied to the four values below)
    slider_min: Optional[float] = Field(default=None)
    slider_max: Optional[float] = Field(default=None)
    slider_start: Optional[float] = Field(default=None)
    slider_step: Optional[float] = Field(default=None)
+
+
class UpkQuestionValidation(BaseModel):
    # Answer-validation rules attached to a question. Immutable.
    model_config = ConfigDict(frozen=True)

    # --- UpkQuestionType.TEXT_ENTRY Options ---
    # No default is given, so the key must be supplied (None is accepted);
    # a supplied list needs at least one pattern.
    patterns: Optional[List[PatternValidation]] = Field(min_length=1)
+
+
# Any selector enum; which one is valid depends on the question's type.
SelectorType = Union[
    UpkQuestionSelectorMC,
    UpkQuestionSelectorTE,
    UpkQuestionSelectorSLIDER,
    UpkQuestionSelectorHIDDEN,
]

# Per-type configuration models, told apart by their `type` field.
Configuration = Annotated[
    Union[
        UpkQuestionConfigurationMC,
        UpkQuestionConfigurationTE,
        UpkQuestionConfigurationSLIDER,
    ],
    Field(discriminator="type"),
]
+
# Sample payload attached to UpkQuestion's JSON schema as its `example`.
# The importance figures appear twice: flattened at the top level and nested
# under "importance".
example_upk_question = {
    "choices": [
        {"order": 0, "choice_id": "1", "exclusive": False, "choice_text": "Yes"},
        {"order": 1, "choice_id": "2", "exclusive": False, "choice_text": "No"},
    ],
    "selector": "SA",
    "task_count": 49,
    "task_score": 3.3401743283265684,
    "marketplace_task_count": {"d": 9, "w": 20, "s": 20},
    "country_iso": "us",
    "question_id": "fb20fd4773304500b39c4f6de0012a5a",
    "language_iso": "eng",
    "question_text": "Are you registered to vote at your present address, or not?",
    "question_type": "MC",
    "importance": UPKImportance(
        task_count=49,
        task_score=3.3401743283265684,
        marketplace_task_count={
            Source.DYNATA: 9,
            Source.WXET: 20,
            Source.SPECTRUM: 20,
        },
    ).model_dump(mode="json"),
    "categories": [
        Category(
            uuid="87b6d819f3ca4815bf1f135b1e829cc6",
            adwords_vertical_id="396",
            label="Politics",
            path="/News/Politics",
            parent_uuid="f66dddba61424ce5be2a38731450a0e1",
        ).model_dump()
    ],
}
+
+
class UpkQuestion(BaseModel):
    # A profiling question: its text, locale, type/selector, optional choices,
    # per-type configuration and answer-validation rules.
    # (Documented with comments on purpose: pydantic copies a class docstring
    # into the generated JSON schema.)
    model_config = ConfigDict(
        populate_by_name=True,
        json_schema_extra={"example": example_upk_question},
        # Don't set this to True. Breaks in model validator (infinite recursion)
        validate_assignment=False,
    )

    # The id is globally unique
    id: Optional[UUIDStr] = Field(default=None, alias="question_id")

    # The format is "{Source}:{question_id}" where Source is 1 or 2 chars, and
    # question_id is the marketplace's ID for this question.
    ext_question_id: Optional[str] = Field(
        default=None,
        description="what marketplace question this question links to",
        pattern=r"^[a-z]{1,2}\:.*",
    )

    type: UpkQuestionType = Field(alias="question_type")

    # ISO 3166-1 alpha-2 (two-letter codes, lowercase)
    country_iso: str = Field(max_length=2, min_length=2, pattern=r"^[a-z]{2}$")
    # 3-char ISO 639-2/B, lowercase
    language_iso: str = Field(max_length=3, min_length=3, pattern=r"^[a-z]{3}$")

    text: str = Field(
        min_length=1,
        description="The text shown to respondents",
        alias="question_text",
    )

    # Don't set a min_length=1 here. We'll allow this to be created, but it
    # won't be askable with empty choices.
    choices: Optional[List[UpkQuestionChoice]] = Field(default=None)
    # Filled in from `type` by set_default_selector when omitted.
    selector: SelectorType = Field(default=None)
    configuration: Optional[Configuration] = Field(default=None)
    validation: Optional[UpkQuestionValidation] = Field(default=None)
    importance: Optional[UPKImportance] = Field(default=None)

    categories: List[Category] = Field(
        default_factory=list,
        description="Categories associated with this question",
    )

    explanation_template: Optional[str] = Field(
        description="Human-readable template for explaining how a user's answer to this question affects eligibility",
        examples=[
            "The company that administers your employer's retirement plan is {answer}."
        ],
        default=None,
    )
    explanation_fragment_template: Optional[str] = Field(
        description="A very short, natural-language explanation fragment that can be combined with others into a single sentence",
        examples=["whose employer's retirement plan is {answer}"],
        default=None,
    )

    @property
    def _key(self):
        """Return the (id, country_iso, language_iso) tuple.

        Raises:
            ValueError: if ``id`` has not been set.
        """
        if self.id is None:
            raise ValueError("must set .id first")
        return self.id, self.country_iso, self.language_iso

    @property
    def locale(self) -> str:
        """Return ``"<country_iso>_<language_iso>"``."""
        return self.country_iso + "_" + self.language_iso

    @property
    def source(self) -> Optional[Source]:
        """Return the Source named by the ext_question_id prefix, else None."""
        if not self.ext_question_id:
            return None
        prefix, _, _ = self.ext_question_id.partition(":")
        return Source(prefix)

    @cached_property
    def choices_text_lookup(self):
        """Map choice id -> choice text, or None when there are no choices.

        Computed once per instance; later edits to ``choices`` are not seen.
        """
        if self.choices is None:
            return None
        return {c.id: c.text for c in self.choices}

    @model_validator(mode="before")
    @classmethod
    def check_configuration_type(cls, data: Dict):
        """Inject the question type into ``configuration`` when it is missing.

        The model knows which Configuration class to build from the key
        'type' inside the configuration object. Callers normally only give
        the type on the top-level question, so it is copied down here.
        """
        if not isinstance(data, dict):
            # Not a raw mapping; nothing to patch.
            return data
        config = data.get("configuration")
        if isinstance(config, dict) and config.get("type") is None:
            question_type = data.get("type") or data.get("question_type")
            # Build new dicts so the caller's payload is left untouched.
            data = {**data, "configuration": {**config, "type": question_type}}
        return data

    @model_validator(mode="after")
    def check_type_options_agreement(self):
        """Text-entry and hidden questions take no choices; others need them.

        An empty list on a text-entry/hidden question is normalised to None.
        """
        if self.type in {UpkQuestionType.TEXT_ENTRY, UpkQuestionType.HIDDEN}:
            if isinstance(self.choices, list) and len(self.choices) == 0:
                self.choices = None
            assert (
                self.choices is None
            ), f"No `choices` are allowed for type `{self.type}`"
        else:
            assert self.choices is not None, "`choices` must be set"
        return self

    @model_validator(mode="after")
    def set_default_selector(self):
        """Pick the default selector for the question type when none is given."""
        if self.selector is None:
            defaults = {
                UpkQuestionType.MULTIPLE_CHOICE: UpkQuestionSelectorMC.SINGLE_ANSWER,
                UpkQuestionType.TEXT_ENTRY: UpkQuestionSelectorTE.SINGLE_LINE,
                UpkQuestionType.SLIDER: UpkQuestionSelectorSLIDER.HORIZONTAL_SLIDER,
            }
            self.selector = defaults.get(self.type, UpkQuestionSelectorHIDDEN.HIDDEN)
        return self

    @model_validator(mode="after")
    def check_type_selector_agreement(self):
        """Require the selector enum that belongs to the question type."""
        expected = {
            UpkQuestionType.MULTIPLE_CHOICE: UpkQuestionSelectorMC,
            UpkQuestionType.TEXT_ENTRY: UpkQuestionSelectorTE,
            # Sliders were previously checked against the text-entry selector,
            # which rejected every slider question.
            UpkQuestionType.SLIDER: UpkQuestionSelectorSLIDER,
            UpkQuestionType.HIDDEN: UpkQuestionSelectorHIDDEN,
        }[self.type]
        assert isinstance(
            self.selector, expected
        ), f"type `{self.type}` must have selector {expected.__name__}"
        return self

    @model_validator(mode="after")
    def check_type_validator_agreement(self):
        """Allow validation.patterns on text-entry questions only."""
        if self.validation and self.validation.patterns is not None:
            assert (
                self.type == UpkQuestionType.TEXT_ENTRY
            ), "validation.patterns is only allowed on Text Entry Questions"
        return self

    @model_validator(mode="after")
    def check_config_choices(self):
        """Check configuration.max_select against the selector and choices."""
        if self.type != UpkQuestionType.MULTIPLE_CHOICE or not self.configuration:
            return self

        max_select = self.configuration.max_select
        single_select = {
            UpkQuestionSelectorMC.SINGLE_ANSWER,
            UpkQuestionSelectorMC.DROPDOWN_LIST,
            UpkQuestionSelectorMC.SELECT_BOX,
        }
        if self.selector in single_select:
            assert (
                max_select == 1
            ), f"configuration.max_select must be 1 if the selector is {self.selector.value}"
        elif max_select is not None:
            # An unset max_select means "no explicit limit"; comparing None
            # with an int would raise TypeError instead of a validation error.
            assert max_select <= len(
                self.choices
            ), "configuration.max_select must be <= len(choices)"
        return self

    @field_validator("choices")
    @classmethod
    def order_choices(cls, choices):
        """Sort the choices in place by their ``order`` value."""
        if choices:
            choices.sort(key=lambda x: x.order)
        return choices

    @field_validator("choices")
    @classmethod
    def validate_choices(cls, choices):
        """Require unique ``id`` and unique ``order`` across the choices."""
        if choices:
            ids = {x.id for x in choices}
            assert len(ids) == len(choices), "choices.id must be unique"
            orders = {x.order for x in choices}
            assert len(orders) == len(choices), "choices.order must be unique"
        return choices

    @field_validator("explanation_template", "explanation_fragment_template")
    @classmethod
    def validate_explanation_template(cls, v):
        """Require the literal ``{answer}`` placeholder in a set template."""
        if v is None:
            return v
        if "{answer}" not in v:
            raise ValueError("field must include '{answer}'")
        return v

    @property
    def is_askable(self) -> bool:
        """Whether the question is fit to be presented to a respondent."""
        if len(self.text) < 5:
            # It should have some text that is question-like. 5 is chosen
            # because it is the shortest known "real" question (spectrum
            # gender = "I'm a")
            return False

        if len(self.text) > 1024:
            # This usually means it is some sort of ridiculous terms &
            # conditions they want the user to agree to, which we don't want
            # to support
            return False

        if self.type == UpkQuestionType.MULTIPLE_CHOICE and not self.choices:
            # A multiple-choice question may be created with no choices, but
            # there is nothing to select, so it cannot be asked.
            return False

        # Almost nothing has >1k options, besides location stuff (cities,
        # etc.) which should get harmonized. When presenting them, we'll
        # filter down options to at most 50.
        if self.choices and (len(self.choices) <= 1 or len(self.choices) > 1000):
            return False

        return True

    @property
    def md5sum(self):
        """MD5 hex digest of the text, type, selector and choice ids/texts.

        Used to determine if a question has changed.
        """
        d = {
            "question_text": self.text,
            "question_type": self.type.value,
            "selector": self.selector.value,
            "choices": (
                [{"choice_id": x.id, "choice_text": x.text} for x in self.choices]
                if self.choices
                else []
            ),
        }
        return hashlib.md5(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest()

    def to_api_format(self):
        """Dump to the API dict: aliased keys, no None values, no
        ext_question_id, and the importance keys lifted to the top level."""
        d = self.model_dump(mode="json", exclude_none=True, by_alias=True)
        # This doesn't currently get included, I think it could but not sure
        # if it would break anything
        d.pop("ext_question_id", None)
        # API expects task_score and task_count on the top-level
        d.update(d.pop("importance", {}))
        return d

    def validate_question_answer(self, answer: Tuple[str, ...]) -> Tuple[bool, str]:
        """
        Returns (is_valid, error_message).
        The message is an empty string when the answer is valid.
        """
        try:
            self._validate_question_answer(answer)
        except AssertionError as e:
            return False, str(e)
        else:
            return True, ""

    def _validate_question_answer(self, answer: Tuple[str, ...]) -> None:
        """
        If the question is MC, validate:
            - validate selector SA vs MA (1 selected vs >1 selected)
            - the answers match actual codes in the choices
            - validate configuration.max_select
            - validate choices.exclusive
        If the question is TE, validate that:
            - configuration.max_length
            - validation.patterns
        Slider and hidden questions only get the duplicate check.
        Throws AssertionError if the answer is invalid, otherwise returns None
        """
        answer = tuple(answer)
        # There should never be multiple of the same value
        assert len(set(answer)) == len(answer), "Multiple of the same answer submitted"

        if self.type == UpkQuestionType.MULTIPLE_CHOICE:
            assert len(answer) >= 1, "MC question with no selected answers"
            choice_codes = {x.id for x in self.choices}
            # NOTE(review): DROPDOWN_LIST / SELECT_BOX are treated as single
            # select by check_config_choices but are not limited to one answer
            # here unless a configuration is set - confirm whether intended.
            if self.selector == UpkQuestionSelectorMC.SINGLE_ANSWER:
                assert (
                    len(answer) == 1
                ), "Single Answer MC question with >1 selected answers"
            elif self.selector == UpkQuestionSelectorMC.MULTIPLE_ANSWER:
                assert len(answer) <= len(
                    self.choices
                ), "More options selected than allowed"
            assert all(
                ans in choice_codes for ans in answer
            ), "Invalid Options Selected"
            # Fall back to "every choice" when there is no configuration OR
            # the configuration leaves max_select unset. (The previous
            # `a if cfg else 0 or b` only applied the fallback without cfg.)
            configured_max = (
                self.configuration.max_select if self.configuration else None
            )
            max_select = configured_max or len(self.choices)
            assert len(answer) <= max_select, "More options selected than allowed"
            # Only the first exclusive choice is considered.
            exclusive_choice = next((x for x in self.choices if x.exclusive), None)
            if exclusive_choice:
                exclusive_choice_id = exclusive_choice.id
                assert (
                    answer == (exclusive_choice_id,)
                    or exclusive_choice_id not in answer
                ), "Invalid exclusive selection"
        elif self.type == UpkQuestionType.TEXT_ENTRY:
            assert len(answer) == 1, "Only one answer allowed"
            text = answer[0]
            assert len(text) > 0, "Must provide answer"
            # Same fallback rule as max_select above; 100000 is the cap used
            # when no max_length is configured.
            configured_max = (
                self.configuration.max_length if self.configuration else None
            )
            max_length = configured_max or 100000
            assert len(text) <= max_length, "Answer longer than allowed"
            if self.validation and self.validation.patterns:
                for pattern in self.validation.patterns:
                    assert re.search(pattern.pattern, text), pattern.message
        elif self.type == UpkQuestionType.HIDDEN:
            pass
+
+
class UpkQuestionOut(UpkQuestion):
    # Output variant of UpkQuestion: choices use the *Out model and the
    # importance figures are mirrored at the top level.
    choices: Optional[List[UpkQuestionChoiceOut]] = Field(default=None)

    # Return both importance top-level model and extracted keys for now.
    # Eventually deprecate one way.
    task_count: Optional[int] = Field(
        default=None,
        ge=0,
        description="The number of live Tasks that use this UPK Question",
        examples=[47],
    )

    task_score: Optional[float] = Field(
        default=None,
        ge=0,
        description="GRL's internal ranked score for the UPK Question",
        examples=[0.11175522477414712],
    )

    marketplace_task_count: Optional[Dict[Source, NonNegativeInt]] = Field(
        default=None,
        description=(
            "The number of live Tasks that use this UPK Question per marketplace"
        ),
        examples=[{Source.DYNATA: 23, Source.SPECTRUM: 24}],
    )

    @model_validator(mode="after")
    def populate_from_importance(self):
        """Mirror the ``importance`` values onto the top-level fields.

        When we return through the api, the importance keys are brought to
        the top-level. Nothing is touched when ``importance`` is unset.
        """
        stats = self.importance
        if not stats:
            return self
        self.task_count = stats.task_count
        self.task_score = stats.task_score
        self.marketplace_task_count = stats.marketplace_task_count
        return self
+
+
def order_exclusive_options(q: UpkQuestion):
    """
    Move catch-all choices to the end of ``q.choices`` and mark them exclusive.

    The idea is to call this when doing a MP -> UPK conversion, where the
    marketplace doesn't have the order specified. The question is modified in
    place; when at least one catch-all choice is found, every choice's
    ``order`` is rewritten to its new list position.
    """
    from generalresearch.models.thl.profiling.other_option import (
        option_is_catch_all,
    )

    if not q.choices:
        return

    catch_alls = [choice for choice in q.choices if option_is_catch_all(choice)]
    if not catch_alls:
        return

    for choice in catch_alls:
        # Pull the choice out and re-add it at the tail.
        q.choices.remove(choice)
        q.choices.append(choice)
        choice.exclusive = True

    for position, choice in enumerate(q.choices):
        choice.order = position
+
+
def trim_options(q: UpkQuestion, max_options: int = 50) -> UpkQuestion:
    """Filter weighted MC/SC Options during Offerwall Requests or Refresh

    - Remove any of ZERO importance
    - ~50 option HARD limit, keep only the 50 highest scoring
    - In soft-pair, take up to requested, or 50
    - Implement N-1 to keep options that are a catch-all / exclusive.

    Args:
        q: The question to trim. It is not modified; a copy is returned.
        max_options: Upper bound on the number of choices kept. Catch-all
            choices are always kept, so the result can exceed this bound only
            when there are more catch-all choices than ``max_options``.

    Returns:
        A copy of ``q`` whose ``choices`` are filtered and sorted by ``order``
        when trimming was needed. Questions without choices, and questions
        whose ``ext_question_id`` starts with "gr:" or "g:", come back
        untrimmed.
    """
    from generalresearch.models.thl.profiling.other_option import (
        option_is_catch_all,
    )

    q = q.model_copy()
    if not q.choices:
        return q
    # ext_question_id is optional, so guard against None before the prefix test.
    if q.ext_question_id and q.ext_question_id.startswith(("gr:", "g:")):
        return q

    if q.choices[0].importance is None:
        # We're calculating UpkQuestionChoice importance on (1) UpkQuestionChoice
        # Creation and (2) every 60min, so this should always be set. However,
        # if it isn't for some reason, don't fail... just show a random set of
        # 50 UpkQuestionChoices. Sorry ¯\_(ツ)_/¯
        # model_copy() above is shallow, so the placeholder importance goes on
        # copies of the choices rather than on the caller's objects.
        q.choices = [
            c.model_copy(
                update={"importance": UPKImportance(task_score=1, task_count=1)}
            )
            for c in q.choices
        ]

    def _task_count(choice: UpkQuestionChoice) -> int:
        # A missing importance or count is treated as zero.
        if choice.importance is None:
            return 0
        return choice.importance.task_count or 0

    def _task_score(choice: UpkQuestionChoice) -> float:
        # A missing importance or score is treated as zero.
        if choice.importance is None:
            return 0
        return choice.importance.task_score or 0

    # Computed after the optional copy so membership tests compare like with like.
    special_choices: Set[UpkQuestionChoice] = {
        c for c in q.choices if option_is_catch_all(c)
    }

    possible_choices = [
        c for c in q.choices if _task_count(c) > 0 or c in special_choices
    ]
    if possible_choices:
        q.choices = possible_choices
    # else: we can't have a MC question with all choices filtered out, so the
    # full list is kept.

    if len(q.choices) > max_options:
        # If there is a Special Choice (eg: "none of the above", "decline to
        # answer", "prefer not to say", etc) always include it. Clamp at zero:
        # a negative slice bound would keep almost everything.
        keep = max(max_options - len(special_choices), 0)
        top_scoring = sorted(q.choices, key=_task_score, reverse=True)[:keep]
        kept = set(top_scoring) | special_choices
        q.choices = sorted(kept, key=lambda x: x.order)

    return q
+
+
# Rebuild the pydantic schema for UpkQuestionOut at import time, once every
# model defined above is available.
UpkQuestionOut.model_rebuild()