diff options
Diffstat (limited to 'generalresearch/models/thl/profiling/upk_question.py')
| -rw-r--r-- | generalresearch/models/thl/profiling/upk_question.py | 683 |
1 files changed, 683 insertions, 0 deletions
diff --git a/generalresearch/models/thl/profiling/upk_question.py b/generalresearch/models/thl/profiling/upk_question.py new file mode 100644 index 0000000..2b952ec --- /dev/null +++ b/generalresearch/models/thl/profiling/upk_question.py @@ -0,0 +1,683 @@ +from __future__ import annotations + +import hashlib +import json +import re +from enum import Enum +from functools import cached_property +from typing import List, Optional, Union, Literal, Dict, Tuple, Set + +from pydantic import ( + BaseModel, + Field, + model_validator, + field_validator, + ConfigDict, + NonNegativeInt, + PositiveInt, +) +from typing_extensions import Annotated + +from generalresearch.models import Source +from generalresearch.models.custom_types import UUIDStr +from generalresearch.models.thl.category import Category + + +class UPKImportance(BaseModel): + task_count: Optional[int] = Field( + ge=0, + default=None, + examples=[47], + description="The number of live Tasks that use this UPK Question", + ) + + task_score: Optional[float] = Field( + ge=0, + default=None, + examples=[0.11175522477414712], + description="GRL's internal ranked score for the UPK Question", + ) + + marketplace_task_count: Optional[Dict[Source, NonNegativeInt]] = Field( + default=None, + examples=[{Source.DYNATA: 23, Source.SPECTRUM: 24}], + description="The number of live Tasks that use this UPK Question per marketplace", + ) + + +class PatternValidation(BaseModel): + model_config = ConfigDict(frozen=True) + + message: str = Field(description="Message to display if validation fails") + + pattern: str = Field( + description="Regex string to validate. min_length and max_length are " + "checked separately, even if they are part of the regex." + ) + + +class UpkQuestionChoice(BaseModel): + model_config = ConfigDict(frozen=False, populate_by_name=True) + + # The choice ID uses the marketplace's code. This needs to be >32 for pollfish + id: str = Field( + min_length=1, + max_length=64, + pattern=r"^[\w\s\.\-]+$", + description="The unique identifier for a response to a qualification", + serialization_alias="choice_id", + validation_alias="choice_id", + frozen=True, + ) + + text: str = Field( + min_length=1, + description="The response text shown to respondents", + alias="choice_text", + frozen=True, + ) + + order: NonNegativeInt = Field() + + # Allows you to group answer choices together (used for display or extra logic) + group: Optional[int] = Field(default=None) + + exclusive: bool = Field( + default=False, + description="If answer is exclusive, it can be the only option selected", + ) + + importance: Optional[UPKImportance] = Field(default=None) + + def __hash__(self): + # We don't know the question ID!! Unique within a question only! + return hash(self.id) + + +class UpkQuestionChoiceOut(UpkQuestionChoice): + pass + # importance: Optional[UPKImportance] = Field(default=None, exclude=True) + + +class UpkQuestionType(str, Enum): + # The question has options that the user must select from. A MC question + # can be e.g. Selector.SINGLE_ANSWER or Selector.MULTIPLE_ANSWER to + # indicate only 1 or more than 1 option can be selected respectively. + MULTIPLE_CHOICE = "MC" + # The question has no options; the user must enter text. + TEXT_ENTRY = "TE" + # The question presents a slider of possible values, typically a numerical range. + SLIDER = "SLIDER" + # The question has no UI elements. + HIDDEN = "HIDDEN" + + +class UpkQuestionSelector(str, Enum): + pass + + +class UpkQuestionSelectorMC(UpkQuestionSelector): + SINGLE_ANSWER = "SA" + MULTIPLE_ANSWER = "MA" + DROPDOWN_LIST = "DL" + SELECT_BOX = "SB" + MULTI_SELECT_BOX = "MSB" + + +class UpkQuestionSelectorTE(UpkQuestionSelector): + SINGLE_LINE = "SL" + MULTI_LINE = "ML" + ESSAY_TEXT_BOX = "ETB" + + +class UpkQuestionSelectorSLIDER(UpkQuestionSelector): + HORIZONTAL_SLIDER = "HSLIDER" + VERTICAL_SLIDER = "VSLIDER" + + +class UpkQuestionSelectorHIDDEN(UpkQuestionSelector): + HIDDEN = "HIDDEN" + + +class UpkQuestionConfigurationMC(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + + # --- UpkQuestionType.MULTIPLE_CHOICE Options --- + # A multiple choice question with MA may allow a limited number of options + # to be selected. + # If the selector is SA, this should be set to 1. If the selector is MA, + # then this must be <= len(choices). + type: Literal[UpkQuestionType.MULTIPLE_CHOICE] = Field( + exclude=True, default=UpkQuestionType.MULTIPLE_CHOICE + ) + + max_select: Optional[int] = Field(gt=0, default=None) + + +class UpkQuestionConfigurationTE(BaseModel): + model_config = ConfigDict(frozen=True, extra="forbid") + + # --- UpkQuestionType.TEXT_ENTRY Options --- + type: Literal[UpkQuestionType.TEXT_ENTRY] = Field( + exclude=True, default=UpkQuestionType.TEXT_ENTRY + ) + + # Sets input form attribute; not the same as regex validation + max_length: Optional[PositiveInt] = Field( + default=None, + description="Maximum str length of any input. Meant as an easy, non" + "regex based check.", + ) + + # The text input box must contain this number of chars before submission + # is allowed + min_length: Optional[PositiveInt] = Field( + default=None, + description="Minimum str length of any input. Meant as an easy, non" + "regex based check.", + ) + + @model_validator(mode="after") + def check_options_agreement(self): + if self.max_length is not None and self.min_length is not None: + assert ( + self.min_length <= self.max_length + ), "max_length must be >= min_length" + return self + + +class UpkQuestionConfigurationSLIDER(BaseModel): + model_config = ConfigDict(frozen=True) + + # --- UpkQuestionType.SLIDER Options --- + type: Literal[UpkQuestionType.SLIDER] = Field( + exclude=True, default=UpkQuestionType.SLIDER + ) + + # TODO: constraints. we don't have any of these so not wasting time on this + slider_min: Optional[float] = Field(default=None) + slider_max: Optional[float] = Field(default=None) + slider_start: Optional[float] = Field(default=None) + slider_step: Optional[float] = Field(default=None) + + +class UpkQuestionValidation(BaseModel): + model_config = ConfigDict(frozen=True) + + # --- UpkQuestionType.TEXT_ENTRY Options --- + patterns: Optional[List[PatternValidation]] = Field(min_length=1) + + +SelectorType = Union[ + UpkQuestionSelectorMC, + UpkQuestionSelectorTE, + UpkQuestionSelectorSLIDER, + UpkQuestionSelectorHIDDEN, +] +Configuration = Annotated[ + Union[ + UpkQuestionConfigurationMC, + UpkQuestionConfigurationTE, + UpkQuestionConfigurationSLIDER, + ], + Field(discriminator="type"), +] + +example_upk_question = { + "choices": [ + { + "order": 0, + "choice_id": "1", + "exclusive": False, + "choice_text": "Yes", + }, + {"order": 1, "choice_id": "2", "exclusive": False, "choice_text": "No"}, + ], + "selector": "SA", + "task_count": 49, + "task_score": 3.3401743283265684, + "marketplace_task_count": { + "d": 9, + "w": 20, + "s": 20, + }, + "country_iso": "us", + "question_id": "fb20fd4773304500b39c4f6de0012a5a", + "language_iso": "eng", + "question_text": "Are you registered to vote at your present address, or not?", + "question_type": "MC", + "importance": UPKImportance( + task_count=49, + task_score=3.3401743283265684, + marketplace_task_count={ + Source.DYNATA: 9, + Source.WXET: 20, + Source.SPECTRUM: 20, + }, + ).model_dump(mode="json"), + "categories": [ + Category( + uuid="87b6d819f3ca4815bf1f135b1e829cc6", + adwords_vertical_id="396", + label="Politics", + path="/News/Politics", + parent_uuid="f66dddba61424ce5be2a38731450a0e1", + ).model_dump() + ], +} + + +class UpkQuestion(BaseModel): + model_config = ConfigDict( + populate_by_name=True, + json_schema_extra={"example": example_upk_question}, + # Don't set this to True. Breaks in model validator (infinite recursion) + validate_assignment=False, + ) + + # The id is globally unique + id: Optional[UUIDStr] = Field(default=None, alias="question_id") + + # The format is "{Source}:{question_id}" where Source is 1 or 2 chars, and + # question_id is the marketplace's ID for this question. + ext_question_id: Optional[str] = Field( + default=None, + description="what marketplace question this question links to", + pattern=r"^[a-z]{1,2}\:.*", + ) + + type: UpkQuestionType = Field(alias="question_type") + + # ISO 3166-1 alpha-2 (two-letter codes, lowercase) + country_iso: str = Field(max_length=2, min_length=2, pattern=r"^[a-z]{2}$") + # 3-char ISO 639-2/B, lowercase + language_iso: str = Field(max_length=3, min_length=3, pattern=r"^[a-z]{3}$") + + text: str = Field( + min_length=1, + description="The text shown to respondents", + alias="question_text", + ) + + # Don't set a min_length=1 here. We'll allow this to be created, but it + # won't be askable with empty choices. + choices: Optional[List[UpkQuestionChoice]] = Field(default=None) + selector: SelectorType = Field(default=None) + configuration: Optional[Configuration] = Field(default=None) + validation: Optional[UpkQuestionValidation] = Field(default=None) + importance: Optional[UPKImportance] = Field(default=None) + + categories: List[Category] = Field( + default_factory=list, + description="Categories associated with this question", + ) + + explanation_template: Optional[str] = Field( + description="Human-readable template for explaining how a user's answer to this question affects eligibility", + examples=[ + "The company that administers your employer's retirement plan is {answer}." + ], + default=None, + ) + explanation_fragment_template: Optional[str] = Field( + description="A very short, natural-language explanation fragment that can be combined with others into a single sentence", + examples=["whose employer's retirement plan is {answer}"], + default=None, + ) + + @property + def _key(self): + if self.id is None: + raise ValueError("must set .id first") + return self.id, self.country_iso, self.language_iso + + @property + def locale(self) -> str: + return self.country_iso + "_" + self.language_iso + + @property + def source(self) -> Optional[Source]: + if self.ext_question_id: + return Source(self.ext_question_id.split(":", 1)[0]) + + @cached_property + def choices_text_lookup(self): + if self.choices is None: + return None + return {c.id: c.text for c in self.choices} + + @model_validator(mode="before") + @classmethod + def check_configuration_type(cls, data: Dict): + # The model knows what the type of Configuration to grab depending on + # the key 'type' which it expects inside the configuration object. + # Here, we grab the type from the top-level model instead. + config = data.get("configuration") + if isinstance(config, dict) and config.get("type") is None: + data.setdefault("configuration", {}) + data["configuration"]["type"] = data.get("type") or data.get( + "question_type" + ) + return data + + @model_validator(mode="after") + def check_type_options_agreement(self): + # If type == "text_entry", options is None. Otherwise, must be set. + if self.type in {UpkQuestionType.TEXT_ENTRY, UpkQuestionType.HIDDEN}: + if isinstance(self.choices, list) and len(self.choices) == 0: + self.choices = None + assert ( + self.choices is None + ), f"No `choices` are allowed for type `{self.type}`" + else: + assert self.choices is not None, f"`choices` must be set" + return self + + @model_validator(mode="after") + def set_default_selector(self): + if self.selector is None: + if self.type == UpkQuestionType.MULTIPLE_CHOICE: + self.selector = UpkQuestionSelectorMC.SINGLE_ANSWER + elif self.type == UpkQuestionType.TEXT_ENTRY: + self.selector = UpkQuestionSelectorTE.SINGLE_LINE + elif self.type == UpkQuestionType.SLIDER: + self.selector = UpkQuestionSelectorSLIDER.HORIZONTAL_SLIDER + else: + self.selector = UpkQuestionSelectorHIDDEN.HIDDEN + return self + + @model_validator(mode="after") + def check_type_selector_agreement(self): + if self.type == UpkQuestionType.MULTIPLE_CHOICE: + assert isinstance( + self.selector, UpkQuestionSelectorMC + ), f"type `{self.type}` must have selector UpkQuestionSelectorMC" + if self.type == UpkQuestionType.TEXT_ENTRY: + assert isinstance( + self.selector, UpkQuestionSelectorTE + ), f"type `{self.type}` must have selector UpkQuestionSelectorTE" + if self.type == UpkQuestionType.SLIDER: + assert isinstance( + self.selector, UpkQuestionSelectorTE + ), f"type `{self.type}` must have selector UpkQuestionSelectorTE" + if self.type == UpkQuestionType.HIDDEN: + assert isinstance( + self.selector, UpkQuestionSelectorHIDDEN + ), f"type `{self.type}` must have selector UpkQuestionSelectorTE" + return self + + @model_validator(mode="after") + def check_type_validator_agreement(self): + if self.validation and self.validation.patterns is not None: + assert ( + self.type == UpkQuestionType.TEXT_ENTRY + ), "validation.patterns is only allowed on Text Entry Questions" + return self + + @model_validator(mode="after") + def check_config_choices(self): + if self.type == UpkQuestionType.MULTIPLE_CHOICE and self.configuration: + if self.selector in { + UpkQuestionSelectorMC.SINGLE_ANSWER, + UpkQuestionSelectorMC.DROPDOWN_LIST, + UpkQuestionSelectorMC.SELECT_BOX, + }: + assert ( + self.configuration.max_select == 1 + ), f"configuration.max_select must be 1 if the selector is {self.selector.value}" + else: + assert self.configuration.max_select <= len( + self.choices + ), "configuration.max_select must be >= len(choices)" + return self + + @field_validator("choices") + @classmethod + def order_choices(cls, choices): + if choices: + choices.sort(key=lambda x: x.order) + return choices + + @field_validator("choices") + @classmethod + def validate_choices(cls, choices): + if choices: + ids = {x.id for x in choices} + assert len(ids) == len(choices), "choices.id must be unique" + orders = {x.order for x in choices} + assert len(orders) == len(choices), "choices.order must be unique" + return choices + + @field_validator("explanation_template", "explanation_fragment_template") + @classmethod + def validate_explanation_template(cls, v): + if v is None: + return v + if "{answer}" not in v: + raise ValueError("field must include '{answer}'") + return v + + @property + def is_askable(self) -> bool: + if len(self.text) < 5: + # It should have some text that is question-like. 5 is chosen + # because it is the shortest known "real" question (spectrum + # gender = "I'm a") + return False + + if len(self.text) > 1024: + # This usually means it is some sort of ridiculous terms & + # conditions they want the user to agree to, which we don't want + # to support + return False + + # Almost nothing has >1k options, besides location stuff (cities, + # etc.) which should get harmonized. When presenting them, we'll + # filter down options to at most 50. + if self.choices and (len(self.choices) <= 1 or len(self.choices) > 1000): + return False + + return True + + @property + def md5sum(self): + # Used to determine if a question has changed + d = { + "question_text": self.text, + "question_type": self.type.value, + "selector": self.selector.value, + "choices": ( + [{"choice_id": x.id, "choice_text": x.text} for x in self.choices] + if self.choices + else [] + ), + } + return hashlib.md5(json.dumps(d, sort_keys=True).encode("utf-8")).hexdigest() + + def to_api_format(self): + d = self.model_dump(mode="json", exclude_none=True, by_alias=True) + # This doesn't currently get included, I think it could but not sure + # if it would break anything + d.pop("ext_question_id", None) + # API expects task_score and task_count on the top-level + d.update(d.pop("importance", {})) + return d + + def validate_question_answer(self, answer: Tuple[str, ...]) -> Tuple[bool, str]: + """ + Returns (is_valid, error_message). + """ + try: + self._validate_question_answer(answer) + except AssertionError as e: + return False, str(e) + else: + return True, "" + + def _validate_question_answer(self, answer: Tuple[str, ...]) -> None: + """ + If the question is MC, validate: + - validate selector SA vs MA (1 selected vs >1 selected) + - the answers match actual codes in the choices + - validate configuration.max_select + - validate choices.exclusive + If the question is TE, validate that: + - configuration.max_length + - validation.patterns + Throws AssertionError if the answer is invalid, otherwise returns None + """ + answer = tuple(answer) + # There should never be multiple of the same value + assert sorted(set(answer)) == sorted( + answer + ), "Multiple of the same answer submitted" + if self.type == UpkQuestionType.MULTIPLE_CHOICE: + assert len(answer) >= 1, "MC question with no selected answers" + choice_codes = set(x.id for x in self.choices) + if self.selector == UpkQuestionSelectorMC.SINGLE_ANSWER: + assert ( + len(answer) == 1 + ), "Single Answer MC question with >1 selected answers" + elif self.selector == UpkQuestionSelectorMC.MULTIPLE_ANSWER: + assert len(answer) <= len( + self.choices + ), "More options selected than allowed" + assert all( + ans in choice_codes for ans in answer + ), "Invalid Options Selected" + max_select = ( + self.configuration.max_select + if self.configuration + else 0 or len(self.choices) + ) + assert len(answer) <= max_select, "More options selected than allowed" + exclusive_choice = next((x for x in self.choices if x.exclusive), None) + if exclusive_choice: + exclusive_choice_id = exclusive_choice.id + assert ( + answer == (exclusive_choice_id,) + or exclusive_choice_id not in answer + ), "Invalid exclusive selection" + elif self.type == UpkQuestionType.TEXT_ENTRY: + assert len(answer) == 1, "Only one answer allowed" + answer = answer[0] + assert len(answer) > 0, "Must provide answer" + max_length = ( + self.configuration.max_length if self.configuration else 0 or 100000 + ) + assert len(answer) <= max_length, "Answer longer than allowed" + if self.validation and self.validation.patterns: + for pattern in self.validation.patterns: + assert re.search(pattern.pattern, answer), pattern.message + elif self.type == UpkQuestionType.HIDDEN: + pass + + +class UpkQuestionOut(UpkQuestion): + choices: Optional[List[UpkQuestionChoiceOut]] = Field(default=None) + # Return both importance top-level model and extracted keys for now. + # Eventually deprecate one way. + task_count: Optional[int] = Field( + ge=0, + default=None, + examples=[47], + description="The number of live Tasks that use this UPK Question", + ) + + task_score: Optional[float] = Field( + ge=0, + default=None, + examples=[0.11175522477414712], + description="GRL's internal ranked score for the UPK Question", + ) + + marketplace_task_count: Optional[Dict[Source, NonNegativeInt]] = Field( + default=None, + examples=[{Source.DYNATA: 23, Source.SPECTRUM: 24}], + description="The number of live Tasks that use this UPK Question per marketplace", + ) + + @model_validator(mode="after") + def populate_from_importance(self): + # When we return through the api, bring the importance keys to the top-level + if self.importance: + self.task_count = self.importance.task_count + self.task_score = self.importance.task_score + self.marketplace_task_count = self.importance.marketplace_task_count + return self + + +def order_exclusive_options(q: UpkQuestion): + """ + The idea is to call then when doing a MP -> UPK conversion, where the + marketplace doesn't have the order specified. + """ + from generalresearch.models.thl.profiling.other_option import ( + option_is_catch_all, + ) + + if q.choices: + last_choices = [c for c in q.choices if option_is_catch_all(c)] + for c in last_choices: + q.choices.remove(c) + q.choices.append(c) + c.exclusive = True + if last_choices: + for idx, c in enumerate(q.choices): + c.order = idx + + +def trim_options(q: UpkQuestion, max_options: int = 50) -> UpkQuestion: + """Filter weighted MC/SC Options during Offerwall Requests or Refresh + + - Remove any of ZERO importance + - ~50 option HARD limit, keep only the 50 highest scoring + - In soft-pair, take up to requested, or 50 + - Implement N-1 to keep options that are a catch-all / exclusive. + """ + from generalresearch.models.thl.profiling.other_option import ( + option_is_catch_all, + ) + + q = q.model_copy() + if not q.choices: + return q + if q.ext_question_id.startswith("gr:") or q.ext_question_id.startswith("g:"): + return q + + special_choices: Set[UpkQuestionChoice] = { + c for c in q.choices if option_is_catch_all(c) + } + + if q.choices[0].importance is None: + # We're calculating UpkQuestionChoice important on (1) UpkQuestionChoice + # Creation and (2) every 60min, so this should always be set. However, + # if isn't for some reason, don't fail... just show a random set of + # 50 UpkQuestionChoices. Sorry ¯\_(ツ)_/¯ + for c in q.choices: + c.importance = UPKImportance(task_score=1, task_count=1) + + possible_choices = [ + c for c in q.choices if c.importance.task_count > 0 or c in special_choices + ] + if possible_choices: + q.choices = possible_choices + else: + # We can't have a MC question with all choices filtered out. + pass + + if len(q.choices) > max_options: + choices = q.choices + # If there is a Special Choice (eg: "none of the above", "decline to + # answer", "prefer not to say", etc) always include it at the bottom. + idx = max_options - len(special_choices) + choices = set( + sorted(choices, key=lambda x: x.importance.task_score, reverse=True)[:idx] + ) + choices.update(special_choices) + q.choices = sorted(choices, key=lambda x: x.order) + + return q + + +UpkQuestionOut.model_rebuild() |
