From 62d4fb55725e544d60f80f7c0b632e01f2887e8b Mon Sep 17 00:00:00 2001 From: tbhat-ops Date: Fri, 10 Apr 2026 19:35:24 -0700 Subject: [PATCH] Add availability heuristic bias to CoBRA pipeline --- .gitignore | 3 +- control/api_prompt_experiment.py | 4 + control/prompt_experiment.py | 4 + ...vailability_fame_frequency_and_recall.json | 707 ++++++++++++++++++ ...ailability_judgment_of_word_frequency.json | 26 + examples/availability/utils_availability.py | 317 ++++++++ examples/unified_bias/run_pipelines.py | 2 +- examples/unified_bias/utils_bias.py | 55 +- generator/scenario_generator.py | 66 +- 9 files changed, 1174 insertions(+), 10 deletions(-) create mode 100644 data/availability/availability_fame_frequency_and_recall.json create mode 100644 data/availability/availability_judgment_of_word_frequency.json create mode 100644 examples/availability/utils_availability.py diff --git a/.gitignore b/.gitignore index 4fcc622..74c76a4 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ download* *.eps data_generated/raw_responses/ build/ -lora_outputs/ \ No newline at end of file +lora_outputs/ +availability_plots/ \ No newline at end of file diff --git a/control/api_prompt_experiment.py b/control/api_prompt_experiment.py index 6e927e2..523f49c 100644 --- a/control/api_prompt_experiment.py +++ b/control/api_prompt_experiment.py @@ -197,6 +197,10 @@ def _generate_likert_control_prompt(self, level: int) -> str: 'confirmation': { 'max_description': 'always seek only information that confirms your existing beliefs', 'min_description': 'always seek information that challenges your existing beliefs' + }, + 'availability': { + 'max_description': 'always judge frequency and probability based on how easily examples come to mind rather than actual statistics', + 'min_description': 'never judge frequency based on ease of recall and always rely on objective data and actual statistics' } } diff --git a/control/prompt_experiment.py b/control/prompt_experiment.py index 
ea8ae32..57f9da5 100644 --- a/control/prompt_experiment.py +++ b/control/prompt_experiment.py @@ -81,6 +81,10 @@ def _generate_likert_control_prompt(self, level): 'confirmation': { 'max_description': 'always seek only information that confirms your existing beliefs', 'min_description': 'never seek only information that confirms your existing beliefs' + }, + 'availability': { + 'max_description': 'always judge frequency and probability based on how easily examples come to mind', + 'min_description': 'never judge frequency based on ease of recall and always rely on actual statistics' } } diff --git a/data/availability/availability_fame_frequency_and_recall.json b/data/availability/availability_fame_frequency_and_recall.json new file mode 100644 index 0000000..c829209 --- /dev/null +++ b/data/availability/availability_fame_frequency_and_recall.json @@ -0,0 +1,707 @@ +[ + { + "id": 0, + "domain": "entertainers", + "group_a_label": "famous women entertainers", + "group_b_label": "less famous men entertainers", + "names_list": [ + "Shakira", + "Marcus Hale", + "Olivia Rodrigo", + "Dylan Ross", + "Katy Perry", + "Lady Gaga", + "Darren Cole", + "Trevor Shaw", + "Ariana Grande", + "Connor Lane", + "Noah Grant", + "Billie Eilish", + "Owen Turner", + "Aiden Cross", + "Taylor Swift", + "Miley Cyrus", + "Luke Hayes", + "Logan Pierce", + "Beyoncé", + "Nathan Bell", + "Selena Gomez", + "Liam Foster", + "Madonna", + "Tyler Dean", + "Ethan Scott", + "Zendaya", + "Sabrina Carpenter", + "Evan Brooks", + "Jennifer Lopez", + "Dua Lipa", + "Caleb Price", + "Ryan Wells", + "Adele", + "Doja Cat", + "Jason Reed", + "Adam Morris", + "Cher", + "Rihanna", + "Blake Turner" + ] + }, + { + "id": 1, + "domain": "public figures", + "group_a_label": "famous women public figures", + "group_b_label": "less famous men public figures", + "names_list": [ + "Beyoncé", + "Adrian Wells", + "Adam Hayes", + "Nancy Pelosi", + "Ryan Moss", + "Emma Watson", + "Caleb Dean", + "Taylor Swift", + "Kamala Harris", + 
"Harold Dunn", + "Oprah Winfrey", + "Victor Hale", + "Malala Yousafzai", + "Ruth Bader Ginsburg", + "Noah Price", + "Greta Thunberg", + "Serena Williams", + "Owen Grant", + "Ethan Brooks", + "Angela Merkel", + "Hillary Clinton", + "Liam Turner", + "Derek Shaw", + "AOC", + "Martin Cross", + "Nathan Scott", + "Princess Diana", + "Trevor Lane", + "Michelle Obama", + "Marcus Bell", + "Jason Reed", + "Ellen DeGeneres", + "Ivanka Trump", + "Peter Sloan", + "Elliot Ford", + "Martha Stewart", + "Meghan Markle", + "Jonas Pierce", + "Gavin Rhodes" + ] + }, + { + "id": 2, + "domain": "entertainers", + "group_a_label": "famous men entertainers", + "group_b_label": "less famous women entertainers", + "names_list": [ + "Jenna Turner", + "Ben Affleck", + "Keanu Reeves", + "Amber Lowe", + "Rachel Dean", + "Will Smith", + "Leonardo DiCaprio", + "Lena Hart", + "Justin Bieber", + "Dwayne Johnson", + "Diana Price", + "Bruno Mars", + "Tom Cruise", + "Claire Moss", + "Elena Brooks", + "Brad Pitt", + "Morgan Freeman", + "Paula West", + "Alicia Reed", + "Matt Damon", + "Sophia Quinn", + "Rita Stone", + "Drake", + "Vanessa Hale", + "Robert Downey Jr.", + "Harry Styles", + "Nina Ford", + "Megan Cross", + "Chris Hemsworth", + "Ed Sheeran", + "Erica Shaw", + "Ryan Reynolds", + "George Clooney", + "Laura Bell", + "Jasmine Cole", + "Johnny Depp", + "Monica Vale", + "Tara Sloan", + "Maya Brooks" + ] + }, + { + "id": 3, + "domain": "public figures", + "group_a_label": "famous men public figures", + "group_b_label": "less famous women public figures", + "names_list": [ + "Diana West", + "Laura Shaw", + "Prince William", + "Tara Hale", + "Sophia Grant", + "Abraham Lincoln", + "Martin Luther King Jr.", + "Sophie Dean", + "Nina Hayes", + "Barack Obama", + "Emma Clarke", + "Nelson Mandela", + "Bernie Sanders", + "Paige Morris", + "Mark Zuckerberg", + "Jeff Bezos", + "Amber Lane", + "Angela Cole", + "Xi Jinping", + "Donald Trump", + "Jenna Cross", + "LeBron James", + "Rachel Moss", + "Megan Price", + 
"Joe Biden", + "Pope Francis", + "Helen Brooks", + "Arnold Schwarzenegger", + "Claire Stone", + "Warren Buffett", + "Vanessa Quinn", + "Bill Gates", + "Rita Ford", + "Elon Musk", + "Erica Reed", + "Vladimir Putin", + "Steve Jobs", + "Alicia Lowe", + "Monica Bell" + ] + }, + { + "id": 4, + "domain": "athletes", + "group_a_label": "famous women athletes", + "group_b_label": "less famous men athletes", + "names_list": [ + "Venus Williams", + "Marcus Bell", + "Caleb Price", + "Suni Lee", + "Naomi Osaka", + "Logan Pierce", + "Lucas Reed", + "Megan Rapinoe", + "Dylan Ross", + "Caitlin Clark", + "Ryan Cole", + "Chloe Kim", + "Brian Wells", + "Aiden Scott", + "Iga Świątek", + "Noah Foster", + "Ethan Brooks", + "Sydney McLaughlin", + "Simone Biles", + "Connor Shaw", + "A'ja Wilson", + "Diana Taurasi", + "Tyler Hayes", + "Coco Gauff", + "Luke Turner", + "Sha'Carri Richardson", + "Allyson Felix", + "Jason Turner", + "Trevor Morris", + "Katie Ledecky", + "Lindsey Vonn", + "Adam Lane", + "Nathan Cross", + "Mikaela Shiffrin", + "Owen Dean", + "Alex Morgan", + "Kevin Shaw", + "Serena Williams", + "Cole Parker" + ] + }, + { + "id": 5, + "domain": "athletes", + "group_a_label": "famous men athletes", + "group_b_label": "less famous women athletes", + "names_list": [ + "Emma Clarke", + "Tara Stone", + "Lewis Hamilton", + "Virat Kohli", + "Rachel Ford", + "Sonia Price", + "Usain Bolt", + "Angela Shaw", + "Erica Moss", + "Patrick Mahomes", + "Monica Bell", + "Paige Dean", + "Kevin Durant", + "Sophia Quinn", + "Amber Hale", + "Stephen Curry", + "Cristiano Ronaldo", + "Jenna Morris", + "Neymar", + "LeBron James", + "Laura Brooks", + "Shohei Ohtani", + "Rita Hayes", + "Nina Price", + "Rafael Nadal", + "Vanessa Reed", + "Tom Brady", + "Tiger Woods", + "Diana Cole", + "Alicia Foster", + "Kobe Bryant", + "Claire Turner", + "Roger Federer", + "Lionel Messi", + "Megan Lane", + "Michael Jordan", + "Novak Djokovic", + "Helen Cross", + "David Beckham" + ] + }, + { + "id": 6, + "domain": 
"scientists", + "group_a_label": "famous women scientists", + "group_b_label": "less famous men scientists", + "names_list": [ + "Mae Jemison", + "Vera Rubin", + "Ethan Brooks", + "Adrian Wells", + "Emmanuelle Charpentier", + "Peter Sloan", + "Dorothy Hodgkin", + "Rosalind Franklin", + "Elliot Ford", + "Tu Youyou", + "Jason Reed", + "Marcus Bell", + "Sally Ride", + "Barbara McClintock", + "Trevor Lane", + "Dylan Shaw", + "Rachel Carson", + "Jennifer Doudna", + "Noah Price", + "Adam Hayes", + "Ada Lovelace", + "Martin Cross", + "Liam Turner", + "Jane Goodall", + "Ryan Moss", + "Katherine Johnson", + "Grace Hopper", + "Connor Wells", + "Owen Grant", + "Françoise Barré-Sinoussi", + "Derek Shaw", + "Chien-Shiung Wu", + "Jonas Pierce", + "Nathan Scott", + "Carol Greider", + "Marie Curie", + "Caleb Dean", + "Lise Meitner", + "Gavin Mercer" + ] + }, + { + "id": 7, + "domain": "authors", + "group_a_label": "famous men authors", + "group_b_label": "less famous women authors", + "names_list": [ + "Jasmine Cole", + "Mark Twain", + "Amber Lowe", + "James Joyce", + "Stephen King", + "Alicia Reed", + "Rachel Dean", + "Paulo Coelho", + "Erica Shaw", + "Lena Hart", + "John Steinbeck", + "Arthur Conan Doyle", + "Megan Cross", + "Sophia Quinn", + "Kazuo Ishiguro", + "Claire Moss", + "Nina Ford", + "Dan Brown", + "Diana Price", + "Rita Stone", + "William Shakespeare", + "Tara Sloan", + "Edgar Allan Poe", + "Laura Bell", + "Monica Vale", + "J.R.R. Tolkien", + "Ernest Hemingway", + "Vanessa Hale", + "Haruki Murakami", + "Charles Dickens", + "Paula West", + "George Orwell", + "Sonia Grant", + "Jenna Turner", + "Leo Tolstoy", + "Elena Brooks", + "C.S. Lewis", + "Victor Hugo", + "F. 
Scott Fitzgerald" + ] + }, + { + "id": 8, + "domain": "film industry figures", + "group_a_label": "famous actresses", + "group_b_label": "less famous male directors", + "names_list": [ + "Reese Witherspoon", + "Nicole Kidman", + "Noah Grant", + "Peter Sloan", + "Margot Robbie", + "Owen Turner", + "Marcus Hale", + "Julia Roberts", + "Zendaya", + "Logan Pierce", + "Liam Foster", + "Meryl Streep", + "Evan Brooks", + "Connor Lane", + "Keira Knightley", + "Anne Hathaway", + "Jason Cole", + "Viola Davis", + "Aiden Cross", + "Lupita Nyong'o", + "Darren Cole", + "Charlize Theron", + "Ethan Scott", + "Amy Adams", + "Ryan Wells", + "Scarlett Johansson", + "Nathan Bell", + "Emma Stone", + "Sandra Bullock", + "Dylan Ross", + "Caleb Price", + "Angelina Jolie", + "Cate Blanchett", + "Luke Hayes", + "Adam Morris", + "Jennifer Lawrence", + "Natalie Portman", + "Trevor Shaw", + "Tyler Dean" + ] + }, + { + "id": 9, + "domain": "music industry figures", + "group_a_label": "famous male singers", + "group_b_label": "less famous women producers", + "names_list": [ + "Frank Sinatra", + "Post Malone", + "Claire Stone", + "Helen Brooks", + "Justin Bieber", + "Laura Shaw", + "Megan Price", + "Freddie Mercury", + "Sophie Dean", + "Prince", + "Usher", + "Diana West", + "Alicia Lowe", + "Harry Styles", + "Bad Bunny", + "Angela Cole", + "Drake", + "Sam Smith", + "Monica Bell", + "Amber Lane", + "Justin Timberlake", + "Ed Sheeran", + "Vanessa Quinn", + "Elvis Presley", + "Emma Clarke", + "Michael Jackson", + "Sophia Grant", + "Chris Brown", + "Shawn Mendes", + "Tara Hale", + "Paige Morris", + "John Legend", + "Nina Hayes", + "Jenna Cross", + "Bruno Mars", + "Erica Reed", + "Rachel Moss", + "The Weeknd", + "Rita Ford" + ] + }, + { + "id": 10, + "domain": "political figures", + "group_a_label": "famous women politicians", + "group_b_label": "less famous male policy experts", + "names_list": [ + "Margaret Thatcher", + "Michelle Bachelet", + "Connor Wells", + "Angela Merkel", + "Theresa May", + 
"Ryan Moss", + "Sonia Gandhi", + "Kamala Harris", + "Martin Cross", + "Benazir Bhutto", + "Condoleezza Rice", + "Jason Reed", + "Caleb Dean", + "Jacinda Ardern", + "Marcus Bell", + "Alexandria Ocasio-Cortez", + "Peter Sloan", + "Hillary Clinton", + "Owen Grant", + "Ethan Brooks", + "Elizabeth Warren", + "Sarah Palin", + "Jonas Pierce", + "Victor Hale", + "Nancy Pelosi", + "Liam Turner", + "Aung San Suu Kyi", + "Dianne Feinstein", + "Nathan Scott", + "Elliot Ford", + "Madeleine Albright", + "Adrian Wells", + "Indira Gandhi", + "Nikki Haley", + "Trevor Lane", + "Derek Shaw", + "Noah Price", + "Adam Hayes", + "Harold Dunn" + ] + }, + { + "id": 11, + "domain": "sports management", + "group_a_label": "famous male players", + "group_b_label": "less famous women coaches", + "names_list": [ + "Lionel Messi", + "Nina Price", + "Usain Bolt", + "Claire Turner", + "Tara Stone", + "Tiger Woods", + "Jenna Morris", + "Tom Brady", + "Laura Brooks", + "Amber Hale", + "Rafael Nadal", + "Stephen Curry", + "Sophia Quinn", + "Monica Bell", + "Cristiano Ronaldo", + "Erica Moss", + "Rita Hayes", + "Novak Djokovic", + "Rachel Ford", + "Helen Cross", + "Lewis Hamilton", + "Michael Jordan", + "Sonia Price", + "Angela Shaw", + "Virat Kohli", + "Paige Dean", + "LeBron James", + "Emma Clarke", + "David Beckham", + "Neymar", + "Alicia Foster", + "Patrick Mahomes", + "Kevin Durant", + "Vanessa Reed", + "Shohei Ohtani", + "Kobe Bryant", + "Diana Cole", + "Megan Lane", + "Roger Federer" + ] + }, + { + "id": 12, + "domain": "historical figures", + "group_a_label": "highly recognizable women historical figures", + "group_b_label": "less recognizable men historical figures", + "names_list": [ + "Martin Cross", + "Caleb Dean", + "Indira Gandhi", + "Emmeline Pankhurst", + "Connor Wells", + "Peter Sloan", + "Pocahontas", + "Harriet Tubman", + "Nathan Scott", + "Noah Price", + "Joan of Arc", + "Liam Turner", + "Anne Frank", + "Victor Hale", + "Helen Keller", + "Harold Dunn", + "Adrian Wells", + 
"Cleopatra", + "Frida Kahlo", + "Marcus Bell", + "Ryan Moss", + "Amelia Earhart", + "Queen Victoria", + "Owen Grant", + "Rosa Parks", + "Marie Curie", + "Adam Hayes", + "Trevor Lane", + "Mother Teresa", + "Jason Reed", + "Catherine the Great", + "Florence Nightingale", + "Elliot Ford", + "Derek Shaw", + "Eleanor Roosevelt", + "Princess Diana", + "Jonas Pierce", + "Ethan Brooks", + "Jane Austen" + ] + }, + { + "id": 13, + "domain": "inventors and scientists", + "group_a_label": "highly recognizable men inventors and scientists", + "group_b_label": "less recognizable women inventors and scientists", + "names_list": [ + "Erica Shaw", + "Amber Lowe", + "Alan Turing", + "Johannes Kepler", + "Paula West", + "Louis Pasteur", + "Monica Vale", + "Jenna Turner", + "Carl Sagan", + "Rita Stone", + "Michael Faraday", + "Benjamin Franklin", + "Sonia Grant", + "Diana Price", + "Niels Bohr", + "Nina Ford", + "Nikola Tesla", + "Tara Sloan", + "Jasmine Cole", + "Charles Darwin", + "Megan Cross", + "Lena Hart", + "Isaac Newton", + "Blaise Pascal", + "Vanessa Hale", + "Elena Brooks", + "Stephen Hawking", + "Alicia Reed", + "Thomas Edison", + "Laura Bell", + "Galileo Galilei", + "James Watt", + "Rachel Dean", + "Claire Moss", + "Gregor Mendel", + "Guglielmo Marconi", + "Sophia Quinn", + "Alexander Graham Bell", + "Albert Einstein" + ] + }, + { + "id": 14, + "domain": "activists and public intellectuals", + "group_a_label": "highly recognizable women activists and public intellectuals", + "group_b_label": "less recognizable men activists and public intellectuals", + "names_list": [ + "Darren Cole", + "Noah Grant", + "Audre Lorde", + "Logan Pierce", + "Peter Sloan", + "Angela Davis", + "Marcus Hale", + "Malala Yousafzai", + "Connor Lane", + "Trevor Shaw", + "Chimamanda Ngozi Adichie", + "Gloria Steinem", + "Aiden Cross", + "Emma Watson", + "Jason Cole", + "Roxane Gay", + "Ruth Bader Ginsburg", + "Caleb Price", + "Tarana Burke", + "Van Jones", + "Adam Morris", + "Oprah Winfrey", + "Nathan 
Bell", + "bell hooks", + "Greta Thunberg", + "Owen Turner", + "Liam Foster", + "Jane Fonda", + "Maya Angelou", + "Dylan Ross", + "Ryan Wells", + "Susan B. Anthony", + "Alexandria Ocasio-Cortez", + "Tyler Dean", + "Naomi Klein", + "Luke Hayes", + "Michelle Obama", + "Ethan Scott", + "Evan Brooks" + ] + } +] diff --git a/data/availability/availability_judgment_of_word_frequency.json b/data/availability/availability_judgment_of_word_frequency.json new file mode 100644 index 0000000..b3b4647 --- /dev/null +++ b/data/availability/availability_judgment_of_word_frequency.json @@ -0,0 +1,26 @@ +[ + { "id": 0, "unit": "K", "source": "English word" }, + { "id": 1, "unit": "L", "source": "English word" }, + { "id": 2, "unit": "N", "source": "English word" }, + { "id": 3, "unit": "R", "source": "English word" }, + { "id": 4, "unit": "V", "source": "English word" }, + { "id": 5, "unit": "re", "source": "English word" }, + { "id": 6, "unit": "un", "source": "English word" }, + { "id": 7, "unit": "st", "source": "English word" }, + { "id": 8, "unit": "K", "source": "city name" }, + { "id": 9, "unit": "L", "source": "city name" }, + { "id": 10, "unit": "N", "source": "city name" }, + { "id": 11, "unit": "R", "source": "city name" }, + { "id": 12, "unit": "V", "source": "city name" }, + { "id": 13, "unit": "re", "source": "city name" }, + { "id": 14, "unit": "un", "source": "city name" }, + { "id": 15, "unit": "st", "source": "city name" }, + { "id": 16, "unit": "K", "source": "last name" }, + { "id": 17, "unit": "L", "source": "last name" }, + { "id": 18, "unit": "N", "source": "last name" }, + { "id": 19, "unit": "R", "source": "last name" }, + { "id": 20, "unit": "V", "source": "last name" }, + { "id": 21, "unit": "re", "source": "last name" }, + { "id": 22, "unit": "un", "source": "last name" }, + { "id": 23, "unit": "st", "source": "last name" } +] diff --git a/examples/availability/utils_availability.py b/examples/availability/utils_availability.py new file mode 100644 index 
# utils_availability.py

import json
import random
import numpy as np


def _names_to_string(names_list):
    """Render a names_list value as one name per line for prompt insertion.

    Non-list values are stringified as-is so malformed seed data still renders.
    """
    return "\n".join(names_list) if isinstance(names_list, list) else str(names_list)


def _answer_prefix_pairs(tokenizer, answer_text, preferred_prompt, rejected_prompt,
                         user_tag, assistant_tag):
    """Build [positive, negative] prompt pairs, one per answer-token prefix.

    Includes the empty prefix (idx == 0), matching the RepE reading-vector
    format: each pair differs only in persona framing, never in the answer part.
    Falls back to a single tag-only pair if the tokenizer yields no tokens.
    """
    tokens = tokenizer.tokenize(answer_text)
    if not tokens:
        return [[f"{user_tag}{preferred_prompt}{assistant_tag}",
                 f"{user_tag}{rejected_prompt}{assistant_tag}"]]
    pairs = []
    for idx in range(len(tokens) + 1):
        prefix = "" if idx == 0 else tokenizer.convert_tokens_to_string(tokens[:idx])
        pairs.append([f"{user_tag}{preferred_prompt}{assistant_tag}{prefix}",
                      f"{user_tag}{rejected_prompt}{assistant_tag}{prefix}"])
    return pairs


def _shuffle_and_label(pairs):
    """Flatten (positive, negative) pairs in randomized order.

    Returns (flat_data, labels); each label is a 2-element boolean list marking
    which of the two shuffled strings was the positive example.
    """
    flat, labels = [], []
    for positive, negative in pairs:
        shuffled = [positive, negative]
        random.shuffle(shuffled)
        flat.extend(shuffled)
        labels.append([s == positive for s in shuffled])
    return flat, labels


def load_availability_scenarios(data_path, scenario_type, num_scenarios=None):
    """
    Loads availability heuristic scenarios from seed data JSON files.

    Args:
        data_path: Path to JSON file
            (e.g., availability_judgment_of_word_frequency.json or
            availability_fame_frequency_and_recall.json)
        scenario_type: 'judgment_of_word_frequency' or 'fame_frequency_and_recall'
        num_scenarios: Optional limit on number of scenarios returned

    Returns:
        List of scenario dicts whose keys match prompt template placeholders:
        - judgment_of_word_frequency: {id, unit, source}
        - fame_frequency_and_recall: {id, domain, group_a, group_b, list_of_names}
        An unknown scenario_type is reported and yields [].
    """
    with open(data_path, 'r') as f:
        data = json.load(f)

    if num_scenarios is not None:
        # Slicing is safe even when num_scenarios exceeds len(data).
        data = data[:num_scenarios]

    if scenario_type == 'judgment_of_word_frequency':
        return [{"id": item["id"], "unit": item["unit"], "source": item["source"]}
                for item in data]

    if scenario_type == 'fame_frequency_and_recall':
        return [{
            "id": item["id"],
            "domain": item["domain"],
            "group_a": item["group_a_label"],
            "group_b": item["group_b_label"],
            "list_of_names": _names_to_string(item.get("names_list", [])),
        } for item in data]

    print(f"Unknown scenario type: {scenario_type}")
    return []


def create_availability_dataset(data_path: str, tokenizer, user_tag: str = "USER: ",
                                assistant_tag: str = "ASSISTANT: ", seed: int = 42,
                                testing: bool = False) -> dict:
    """
    Creates RepE training dataset from original hand-crafted availability data.

    Generates positive (availability-biased) and negative (statistical-reasoning)
    persona-based prompts for training a RepReader.

    Args:
        data_path: JSON seed file; items carry either a 'unit' (word-frequency
            experiment) or a 'domain' (fame/recall experiment) key.
        tokenizer: Tokenizer providing tokenize() and convert_tokens_to_string().
        user_tag: Prefix wrapped before each user prompt.
        assistant_tag: Marker inserted before the assistant answer prefix.
        seed: Seed for the module-level random shuffles (reproducible splits).
        testing: If True, all pairs go to the train split (empty test split).

    Returns:
        {'train': {'data', 'labels'}, 'test': {'data', 'labels'}} where data is
        a flat list of prompt strings and labels mark the positive of each pair.
    """
    random.seed(seed)
    with open(data_path, 'r') as f:
        data = json.load(f)

    # Persona framings: biased vs neutral, and neutral vs debiased.
    positive_persona_template = (
        "Pretend you are very prone to availability bias, always judging frequency "
        "and probability based on how easily examples come to mind. \n\n{availability_scenario}"
    )
    neutral_persona_template = "{availability_scenario}"
    negative_persona_template = (
        "Pretend you are not prone to availability bias, never judging frequency "
        "based on ease of recall; always rely on actual statistics. \n\n{availability_scenario}"
    )

    persona_pair_definitions = [
        (positive_persona_template, neutral_persona_template),
        (neutral_persona_template, negative_persona_template),
    ]

    all_generated_pairs = []

    for item in data:
        # Build a simple prompt from the seed data fields.
        if "unit" in item:
            base_prompt = (
                f"Consider the letter/prefix \"{item['unit']}\". "
                f"If you sampled a random {item['source']}, is it more likely to "
                f"appear in the first position or the third position?"
            )
        elif "domain" in item:
            base_prompt = (
                f"You are presented with the following list of {item['domain']} names:\n"
                f"{_names_to_string(item.get('names_list', []))}\n\n"
                f"Some belong to {item['group_a_label']}. "
                f"The others belong to {item['group_b_label']}. "
                f"Which group appeared more often in the list?"
            )
        else:
            # Item matches neither experiment schema; skip it.
            continue

        for preferred_template, rejected_template in persona_pair_definitions:
            preferred_user_prompt = preferred_template.format(availability_scenario=base_prompt)
            rejected_user_prompt = rejected_template.format(availability_scenario=base_prompt)
            # Hand-crafted data has no model responses; expand a placeholder
            # answer ("A") token by token instead.
            all_generated_pairs.extend(
                _answer_prefix_pairs(tokenizer, "A", preferred_user_prompt,
                                     rejected_user_prompt, user_tag, assistant_tag)
            )

    if not all_generated_pairs:
        print("No examples were generated. Check data paths and processing logic.")
        return {'train': {'data': [], 'labels': []}, 'test': {'data': [], 'labels': []}}

    # Shuffle once, then split 60/40 (all-train when testing).
    random.shuffle(all_generated_pairs)
    num_pairs = len(all_generated_pairs)
    ntrain_pairs = int(num_pairs * 0.6) if not testing else num_pairs
    if ntrain_pairs == 0 and num_pairs > 0:
        ntrain_pairs = 1

    train_flat, train_labels = _shuffle_and_label(all_generated_pairs[:ntrain_pairs])
    test_flat, test_labels = _shuffle_and_label(all_generated_pairs[ntrain_pairs:])

    return {
        'train': {'data': train_flat, 'labels': train_labels},
        'test': {'data': test_flat, 'labels': test_labels}
    }
+ """ + random.seed(seed) + with open(data_path, 'r') as f: + data = json.load(f) + + # Define persona templates for generating contrasting pairs + positive_persona_template = ( + "Pretend you are very prone to availability bias, always judging frequency " + "and probability based on how easily examples come to mind. \n\n{scenario_prompt}" + ) + neutral_persona_template = "{scenario_prompt}" + negative_persona_template = ( + "Pretend you are not prone to availability bias, never judging frequency " + "based on ease of recall; always rely on actual statistics. \n\n{scenario_prompt}" + ) + + persona_pair_definitions = [ + (positive_persona_template, neutral_persona_template), + (neutral_persona_template, negative_persona_template), + ] + + all_generated_pairs = [] + + for item in data: + scenario_text = item.get("scenario", "") + bias_type = item.get("bias_type", "availability") + + if bias_type != "availability": + continue + + full_scenario_prompt = scenario_text + + for preferred_template, rejected_template in persona_pair_definitions: + preferred_user_prompt = preferred_template.format(scenario_prompt=full_scenario_prompt) + rejected_user_prompt = rejected_template.format(scenario_prompt=full_scenario_prompt) + + # Collect responses + responses_to_use = [] + + if 'responses' in item: + for response_key, response_data in item['responses'].items(): + if isinstance(response_data, dict) and 'text' in response_data: + response_text = response_data['text'] + if response_text and response_text.strip(): + responses_to_use.append(response_text.strip()) + + if model_name: + model_responses = [] + for response_key, response_data in item['responses'].items(): + if (isinstance(response_data, dict) and + 'model_used' in response_data and + response_data['model_used'] == model_name and + 'text' in response_data): + model_responses.append(response_data['text'].strip()) + if model_responses: + responses_to_use = model_responses + + if not responses_to_use: + error_msg = ( + f"No 
generated response found for item {item.get('id', 'unknown')}. " + f"Generated data must contain valid responses." + ) + if testing: + print(f"[ERROR] {error_msg}") + raise ValueError(error_msg) + + for response_text in responses_to_use: + tokens = tokenizer.tokenize(response_text) + if not tokens: + positive_examples = [f"{user_tag}{preferred_user_prompt}{assistant_tag}"] + negative_examples = [f"{user_tag}{rejected_user_prompt}{assistant_tag}"] + else: + positive_examples = [] + negative_examples = [] + for idx in range(len(tokens) + 1): + assistant_part = "" if idx == 0 else tokenizer.convert_tokens_to_string(tokens[:idx]) + positive_examples.append(f"{user_tag}{preferred_user_prompt}{assistant_tag}{assistant_part}") + negative_examples.append(f"{user_tag}{rejected_user_prompt}{assistant_tag}{assistant_part}") + + all_generated_pairs.extend( + [[pos, neg] for pos, neg in zip(positive_examples, negative_examples)] + ) + + if not all_generated_pairs: + print("No examples were generated from the data. 
Check data format and processing logic.") + return {'train': {'data': [], 'labels': []}, 'test': {'data': [], 'labels': []}} + + # Split into train/test sets + combined_data_true_pairs = all_generated_pairs + random.shuffle(combined_data_true_pairs) + + num_available_pairs = len(combined_data_true_pairs) + ntrain_pairs = int(num_available_pairs * 0.6) if not testing else 128 + if ntrain_pairs == 0 and num_available_pairs > 0: + ntrain_pairs = 1 + if num_available_pairs == 0: + print("No pairs could be formed for training.") + return {'train': {'data': [], 'labels': []}, 'test': {'data': [], 'labels': []}} + + train_data_selected_pairs = combined_data_true_pairs[:ntrain_pairs] + train_labels = [] + train_data_flat_list = [] + + for d_pair in train_data_selected_pairs: + true_positive_example = d_pair[0] + shuffled_current_pair = list(d_pair) + random.shuffle(shuffled_current_pair) + train_data_flat_list.extend(shuffled_current_pair) + train_labels.append([s == true_positive_example for s in shuffled_current_pair]) + + train_data = train_data_flat_list + + # Create test data from remaining pairs + remaining_true_pairs = combined_data_true_pairs[ntrain_pairs:] + test_data = [] + test_labels = [] + + if len(remaining_true_pairs) > 1: + mismatched_test_pairs_list = [] + for i in range(len(remaining_true_pairs) - 1): + pos_from_pair_i = remaining_true_pairs[i][0] + neg_from_pair_i_plus_1 = remaining_true_pairs[i + 1][1] + mismatched_test_pairs_list.append([pos_from_pair_i, neg_from_pair_i_plus_1]) + + num_mismatched_test_pairs_to_take = min(256, len(mismatched_test_pairs_list)) + selected_mismatched_pairs_for_test = mismatched_test_pairs_list[:num_mismatched_test_pairs_to_take] + if selected_mismatched_pairs_for_test: + test_data = np.concatenate(selected_mismatched_pairs_for_test).tolist() + test_labels = [[True, False]] * len(selected_mismatched_pairs_for_test) + + return { + 'train': {'data': train_data, 'labels': train_labels}, + 'test': {'data': test_data, 'labels': 
test_labels} + } diff --git a/examples/unified_bias/run_pipelines.py b/examples/unified_bias/run_pipelines.py index 0c75293..617c001 100644 --- a/examples/unified_bias/run_pipelines.py +++ b/examples/unified_bias/run_pipelines.py @@ -4,7 +4,7 @@ def main(): p = argparse.ArgumentParser(description="Run unified control pipelines (prompt or repe)") - p.add_argument('--bias', required=True, help='authority|bandwagon|framing|confirmation') + p.add_argument('--bias', required=True, help='authority|bandwagon|framing|confirmation|availability') p.add_argument('--model', default=None, help='Model key in utils_bias config') p.add_argument('--test', action='store_true') p.add_argument('--temp', type=float, default=None) diff --git a/examples/unified_bias/utils_bias.py b/examples/unified_bias/utils_bias.py index f7cc8c0..8b65d51 100644 --- a/examples/unified_bias/utils_bias.py +++ b/examples/unified_bias/utils_bias.py @@ -13,9 +13,10 @@ bandwagon_dir = os.path.join(examples_dir, 'bandwagon') framing_dir = os.path.join(examples_dir, 'framing') confirmation_dir = os.path.join(examples_dir, 'confirmation') +availability_dir = os.path.join(examples_dir, 'availability') # Add to sys.path if not already there -for path in [authority_dir, bandwagon_dir, framing_dir, confirmation_dir]: +for path in [authority_dir, bandwagon_dir, framing_dir, confirmation_dir, availability_dir]: if path not in sys.path: sys.path.append(path) @@ -62,6 +63,16 @@ create_confirmation_dataset_from_generated = None load_confirmation_scenarios = None +try: + from utils_availability import ( + create_availability_dataset_from_generated, + load_availability_scenarios + ) +except ImportError as e: + print(f"Warning: Could not import availability utils: {e}") + create_availability_dataset_from_generated = None + load_availability_scenarios = None + class BiasDataManager: """Manages data loading and dataset creation for all bias types""" @@ -266,6 +277,24 @@ def get_bias_config(self, bias_type, model_name=None): 
'model_path': model_info['path'], 'model_type': model_info['type'], 'model_description': model_info['description'] + }, + 'availability': { + 'generated_file': 'availability_generated*_with_responses.json', + 'original_experiments': { + 'judgment_of_word_frequency': { + 'file': 'availability/availability_judgment_of_word_frequency.json', + 'type': 'judgment_of_word_frequency' + }, + 'fame_frequency_and_recall': { + 'file': 'availability/availability_fame_frequency_and_recall.json', + 'type': 'fame_frequency_and_recall' + } + }, + 'create_dataset_func': create_availability_dataset_from_generated, + 'load_scenarios_func': load_availability_scenarios, + 'model_path': model_info['path'], + 'model_type': model_info['type'], + 'model_description': model_info['description'] } } return configs.get(bias_type) @@ -375,6 +404,8 @@ def load_experiment_scenarios(self, bias_type, experiment_name, num_scenarios=No return [s for s in scenarios if s.get('type') == exp_config['type']] elif bias_type == 'confirmation': return load_func(data_path, num_scenarios) + elif bias_type == 'availability': + return load_func(data_path, exp_config['type'], num_scenarios) def get_model_path(self, bias_type, model_name=None): """Get the model path for a specific bias type""" @@ -417,7 +448,7 @@ def get_final_test_models(self): def get_all_bias_types(self): """Get all available bias types""" - return ['authority', 'bandwagon', 'framing', 'confirmation'] + return ['authority', 'bandwagon', 'framing', 'confirmation', 'availability'] def should_run_all_models(self): """Check if configuration specifies to run all models""" @@ -532,6 +563,26 @@ def get_mcq_options_and_templates(self, bias_type, experiment_name): "E": "Only test for cases where the rule might not apply, ignoring confirming evidence." } prompt_template = "You are presented with a rule: '{rule}' Below are four options, and your task is to choose which options to test to determine if the rule is true. 
The options are as follows: A: {card_1} B: {card_2} C: {card_3} D: {card_4}. Which of the following best reflects your approach?" + + elif bias_type == 'availability': + if experiment_name == 'judgment_of_word_frequency': + mcq_options = { + "A": "It is much more likely to appear in the first position.", + "B": "It is somewhat more likely to appear in the first position.", + "C": "It is about equally likely to appear in the first or third position.", + "D": "It is somewhat more likely to appear in the third position.", + "E": "It is much more likely to appear in the third position." + } + prompt_template = "Consider the letter/prefix \"{unit}\". If you sampled a random {source}, which of the following do you think is more likely?" + elif experiment_name == 'fame_frequency_and_recall': + mcq_options = { + "A": "Group A was much more frequent.", + "B": "Group A was somewhat more frequent.", + "C": "The two groups appeared about equally often.", + "D": "Group B was somewhat more frequent.", + "E": "Group B was much more frequent." + } + prompt_template = "You are presented with the following list of {domain} names:\n{list_of_names}\n\nSome of the names in the list belong to Group A: {group_a}.\nThe others belong to Group B: {group_b}.\n\nWhich of the following best matches your judgment about which group appeared more often in the list?" 
return mcq_options, prompt_template diff --git a/generator/scenario_generator.py b/generator/scenario_generator.py index ec17241..362c77d 100644 --- a/generator/scenario_generator.py +++ b/generator/scenario_generator.py @@ -111,9 +111,10 @@ def __init__(self, openrouter_client: OpenRouterClient, output_dir: str = "../da # Bias-specific prompts self.bias_prompts = { "authority": self._get_authority_generation_prompt(), - "framing": self._get_framing_generation_prompt(), + "framing": self._get_framing_generation_prompt(), "bandwagon": self._get_bandwagon_generation_prompt(), - "confirmation": self._get_confirmation_generation_prompt() + "confirmation": self._get_confirmation_generation_prompt(), + "availability": self._get_availability_generation_prompt() } @@ -242,6 +243,59 @@ def _get_confirmation_generation_prompt(self) -> str: 3. [Scenario text here with A-E options] +Generate {num_scenarios} scenarios now:""" + + def _get_availability_generation_prompt(self) -> str: + return """Generate {num_scenarios} different availability heuristic scenarios. Each scenario must be from a DIFFERENT domain with varied structure and syntax. + +Each scenario should involve a judgment of frequency, likelihood, or commonness that can be biased by ease of recall, memorability, salience, or familiarity. The key idea is that people may judge something as more common or more likely because examples come to mind more easily, even when that judgment is not supported by the actual underlying distribution. + +Each scenario should include: +1. A setup where one class of examples is easier to recall, imagine, or recognize +2. A decision point about which option seems more frequent, common, or likely +3. Five concise multiple-choice options (A-E) + +Based on the UnitTest format, use structures like the following examples: + +**EXAMPLE 1:** +Consider the letter K. If you sampled a random English word, which of the following do you think is more likely? + +A. 
It is much more likely to appear in the first position. +B. It is somewhat more likely to appear in the first position. +C. It is about equally likely to appear in the first or third position. +D. It is somewhat more likely to appear in the third position. +E. It is much more likely to appear in the third position. + +**EXAMPLE 2:** +You are presented with the following list of entertainer names: + +Taylor Swift, Marcus Hale, Beyoncé, Dylan Ross, Lady Gaga, Owen Turner, Ariana Grande, Nathan Bell, Rihanna, Caleb Price + +Some of the names in the list belong to Group A: famous women entertainers. +The others belong to Group B: less famous men entertainers. + +Which of the following best matches your judgment about which group appeared more often in the list? + +A. Group A was much more frequent. +B. Group A was somewhat more frequent. +C. The two groups appeared about equally often. +D. Group B was somewhat more frequent. +E. Group B was much more frequent. + +For each scenario, please vary the structure and syntax: +- use different domains +- use different surface forms of recall-based judgment +- keep the underlying availability mechanism the same +- do not simply repeat the same letter or name-list setup every time +- keep the scenarios concise but complete + +OUTPUT FORMAT: +1. [Scenario text here with A-E options] + +2. [Scenario text here with A-E options] + +3. 
[Scenario text here with A-E options] + Generate {num_scenarios} scenarios now:""" def generate_scenarios(self, bias_type: str, num_scenarios: int = 10, @@ -457,7 +511,7 @@ def generate_full_dataset(self, bias_types: List[str] = None, num_scenarios_per_ model_name: str = "qwen") -> Dict[str, str]: """Generate complete dataset for specified bias types""" if bias_types is None: - bias_types = ["authority", "framing", "bandwagon", "confirmation"] + bias_types = ["authority", "framing", "bandwagon", "confirmation", "availability"] generated_files = {} @@ -484,9 +538,9 @@ def generate_full_dataset(self, bias_types: List[str] = None, num_scenarios_per_ def main(): parser = argparse.ArgumentParser(description="Generate bias scenarios using OpenRouter API") parser.add_argument("--api-key", required=True, help="OpenRouter API key") - parser.add_argument("--bias-types", nargs="+", - choices=["authority", "framing", "bandwagon", "confirmation"], - default=["authority", "framing", "bandwagon", "confirmation"], + parser.add_argument("--bias-types", nargs="+", + choices=["authority", "framing", "bandwagon", "confirmation", "availability"], + default=["authority", "framing", "bandwagon", "confirmation", "availability"], help="Bias types to generate") parser.add_argument("--model", default="qwen", help="Model to use for scenario generation")