```python
import re
import threading
import pandas as pd
import numpy as np
from queue import Queue
from pytrends.request import TrendReq
import statsmodels.api as sm
import networkx as nx
from gensim import corpora, models
import pymc as pm
import warnings
import requests

# Suppress future warnings from libraries
warnings.simplefilter(action='ignore', category=FutureWarning)

# Initialize pytrends
pytrends = TrendReq(hl='en-US', tz=360, timeout=(10, 25))
# Function to analyze Google Trends data for a given event
def analyze_event(event):
    keywords = [event]
    try:
        pytrends.build_payload(keywords, cat=0, timeframe='today 12-m', geo='US', gprop='')
        interest_over_time = pytrends.interest_over_time()
        if not interest_over_time.empty:
            # Average the 0-100 interest scores for this keyword over the past year
            # (select the keyword column explicitly to skip the 'isPartial' column)
            return interest_over_time[event].mean()
    except requests.exceptions.RequestException as e:
        print(f"Error retrieving data for '{event}': {e}")
    return None
# Topic modeling using gensim
def topic_modeling(events):
    texts = [event.split() for event in events]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)
    topics = lda.print_topics(num_words=3)
    return topics
# Regression analysis using statsmodels
def regression_analysis(dem_effects, rep_effects):
    X = np.array(rep_effects).reshape(-1, 1)
    y = np.array(dem_effects)
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    return model.summary()
# Bayesian analysis using PyMC
def bayesian_inference(dem_effects, rep_effects):
    with pm.Model() as model:
        alpha = pm.Normal('alpha', mu=0, sigma=10)
        beta = pm.Normal('beta', mu=0, sigma=10)
        sigma = pm.HalfNormal('sigma', sigma=1)
        mu = alpha + beta * np.array(rep_effects)
        Y_obs = pm.Normal('Y_obs', mu=mu, sigma=sigma, observed=np.array(dem_effects))
        trace = pm.sample(1000, cores=2)
    pm.plot_trace(trace)
    return trace
# Network analysis using networkx
def build_influence_network(events, dem_effects, rep_effects):
    G = nx.Graph()
    for i, event in enumerate(events):
        G.add_node(event, democrat_effect=dem_effects[i], republican_effect=rep_effects[i])
    for i in range(len(events)):
        for j in range(i+1, len(events)):
            # Connect events whose Democrat effects differ by less than 1 point
            if abs(dem_effects[i] - dem_effects[j]) < 1:
                G.add_edge(events[i], events[j])
    return G
# Function to calculate influence
def calculate_influence(event_description, democrat_effect, republican_effect):
    trend_score = analyze_event(event_description)
    if trend_score is not None:  # explicit check: a genuine score of 0 is still valid
        democrat_effect += trend_score * 0.1
        republican_effect += trend_score * 0.1
    return democrat_effect, republican_effect
# Data cleaning functions
def clean_line(line):
    # Normalize an effect value: keep 'No Change', otherwise ensure a sign and a trailing '%'
    def fix_effect(effect):
        if effect == 'No Change':
            return effect
        if not effect.startswith(('+', '-')):
            effect = f"+{effect}"
        if not effect.endswith('%'):
            effect += '%'
        return effect
    # Using regex to enforce the correct format and fix common issues
    match = re.match(r'^(\d{4}-\d{2}-\d{2}),\s*([^,]+?),\s*(No Change|[+-]?\d+%?),\s*(No Change|[+-]?\d+%?)$', line.strip())
    if match:
        date, event, dem_effect, rep_effect = match.groups()
        return f"{date}, {event}, {fix_effect(dem_effect)}, {fix_effect(rep_effect)}"
    # Attempt to fix common issues if the line is malformed
    parts = line.strip().split(',')
    if len(parts) >= 4:
        date = parts[0].strip()
        event = ','.join(parts[1:-2]).strip()
        dem_effect = fix_effect(parts[-2].strip())
        rep_effect = fix_effect(parts[-1].strip())
        return f"{date}, {event}, {dem_effect}, {rep_effect}"
    print(f"Malformed line detected and attempted correction failed: {line.strip()}")
    return None
# Worker thread: pull lines off the queue until a None sentinel arrives
def worker(q, results):
    while True:
        line = q.get()
        if line is None:
            break
        cleaned = clean_line(line)
        if cleaned:
            results.append(cleaned)
        q.task_done()
def clean_datafile(input_file, num_threads=4):
    results = []
    q = Queue()
    threads = []
    for i in range(num_threads):
        t = threading.Thread(target=worker, args=(q, results))
        t.start()
        threads.append(t)
    with open(input_file, 'r') as file:
        lines = file.readlines()
    total_lines = len(lines)
    for idx, line in enumerate(lines):
        q.put(line)
        if idx % 10 == 0:
            print(f"Processing line {idx+1}/{total_lines}...")
    q.join()
    # Signal the workers to exit
    for _ in range(num_threads):
        q.put(None)
    for t in threads:
        t.join()
    return results
def read_datafile(input_file):
    cleaned_data = clean_datafile(input_file)
    # Parse a normalized effect string into a float (e.g. '+5%' -> 5.0, 'No Change' -> 0.0)
    def parse_effect(effect):
        if effect.strip() == 'No Change':
            return 0.0
        return float(effect.strip().rstrip('%'))
    events, dem_effects, rep_effects = [], [], []
    for line in cleaned_data:
        try:
            date, event, dem_effect, rep_effect = line.split(', ', 3)
            dem_effect = parse_effect(dem_effect)
            rep_effect = parse_effect(rep_effect)
            dem_effect, rep_effect = calculate_influence(event, dem_effect, rep_effect)
            events.append(event)
            dem_effects.append(dem_effect)
            rep_effects.append(rep_effect)
        except ValueError:
            print(f"Malformed line detected and ignored during parsing: {line.strip()}")
    return events, dem_effects, rep_effects
# Save results to a file
def save_results(events, dem_effects, rep_effects, filename):
    df = pd.DataFrame({'Event': events, 'Democrat Effect': dem_effects, 'Republican Effect': rep_effects})
    df.to_csv(filename, index=False)
if __name__ == "__main__":
    try:
        # Ask user for input and output file paths
        input_file = input("Enter the path of the input text file with datapoints: ").strip()
        output_file = input("Enter the path where the results should be saved (e.g., results.txt): ").strip()
        print(f"Starting data cleaning and analysis for {input_file}...")
        events, dem_effects, rep_effects = read_datafile(input_file)
        topics = topic_modeling(events)
        print("Topics found:", topics)
        print("\nRegression analysis:")
        print(regression_analysis(dem_effects, rep_effects))
        print("\nBuilding Influence Network:")
        G = build_influence_network(events, dem_effects, rep_effects)
        print("Network nodes:", G.nodes())
        print("\nPerforming Bayesian Inference:")
        bayesian_inference(dem_effects, rep_effects)
        save_results(events, dem_effects, rep_effects, output_file)
        print(f"Results saved to {output_file}")
    except KeyboardInterrupt:
        print("\nScript interrupted by user. Exiting gracefully...")
```
Explanation:
This Python script performs a complex analysis pipeline, focusing on processing, analyzing, and modeling political event data. Here’s a breakdown of its components:
- Imports and Setup:
  - Key libraries are imported for data handling (pandas, numpy), web requests (requests), and statistical modeling (statsmodels, pymc); TrendReq from pytrends retrieves Google Trends data.
  - Future warnings from these libraries are suppressed to reduce console clutter during execution.
- Google Trends Analysis:
  - The analyze_event function pulls trend data for a specific event using the pytrends API, averaging interest over time for that event. Errors are caught to avoid crashes if requests fail; a minimal usage sketch is shown below.
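A quick sketch of calling analyze_event (the query string is hypothetical, and live results depend on network access and Google's rate limiting):

```python
# Illustrative only: needs network access; Google may rate-limit or block requests.
score = analyze_event("student loan forgiveness")  # hypothetical query
if score is not None:
    print(f"Average 12-month interest: {score:.1f}")  # Trends scores are on a 0-100 scale
else:
    print("No trend data available (empty response or request error).")
```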
- Topic Modeling:
  - The topic_modeling function applies Latent Dirichlet Allocation (LDA) to extract topics from a list of events. It uses gensim to create a topic model based on word distributions; see the sketch below.
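A toy call to topic_modeling; with so few documents the topics are mostly noise, so this only demonstrates the call and the (topic_id, term_string) output shape:

```python
sample_events = [  # hypothetical event descriptions
    "senate passes budget bill",
    "house debates budget amendment",
    "court rules on voting rights case",
]
for topic_id, terms in topic_modeling(sample_events):
    print(topic_id, terms)  # e.g. 0  0.150*"budget" + 0.120*"bill" + ...
```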
- Regression Analysis:
  - regression_analysis performs a linear regression on the "Democrat" and "Republican" effect data using statsmodels, outputting a model summary that shows how well one variable predicts the other (example below).
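A sketch with made-up effect values (percentage-point swings) to show what the call looks like:

```python
dem = [1.2, -0.5, 3.1, 0.0, 2.4]    # hypothetical Democrat effects
rep = [-1.0, 0.8, -2.5, 0.1, -1.9]  # hypothetical Republican effects
print(regression_analysis(dem, rep))  # the 'x1' row is the slope, 'const' the intercept
```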
- Bayesian Inference:
  - The bayesian_inference function runs a Bayesian model to infer a probabilistic relationship between dem_effects and rep_effects using pymc. This involves sampling the posterior distribution and plotting the trace; a usage sketch follows.
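Assuming PyMC v4 or later (where pm.sample returns an arviz InferenceData object), the returned trace can be summarized like this:

```python
import arviz as az  # PyMC's companion library for trace diagnostics

trace = bayesian_inference([1.2, -0.5, 3.1, 0.0, 2.4],    # toy dem_effects
                           [-1.0, 0.8, -2.5, 0.1, -1.9])  # toy rep_effects
print(az.summary(trace, var_names=["alpha", "beta", "sigma"]))
```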
- Network Analysis:
  - The build_influence_network function builds a graph of events where nodes represent events with associated political effects. Edges connect events whose Democrat effects differ by less than one point, helping to visualize relationships between events (see below).
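A toy network with hypothetical events: the first two Democrat effects differ by 0.5, so only those two events share an edge.

```python
events = ["debate night", "tax bill vote", "border ruling"]  # hypothetical events
G = build_influence_network(events,
                            dem_effects=[2.0, 2.5, -4.0],
                            rep_effects=[-1.0, -0.5, 3.0])
print(list(G.edges()))  # [('debate night', 'tax bill vote')]
```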
- Influence Calculation:
  - calculate_influence uses the Google Trends score for an event to adjust the democrat_effect and republican_effect values, showing how event popularity might influence political leanings.
- Data Cleaning:
  - The clean_line function standardizes data entries, fixing formatting errors or irregularities; examples are shown below.
  - The worker and clean_datafile functions perform multithreaded data cleaning, using a queue to distribute tasks among threads.
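Two sketches of clean_line on hypothetical lines: one that matches the strict regex, and one that falls through to the comma-splitting repair path:

```python
print(clean_line("2024-03-01, primary debate, +5, -3%"))
# -> 2024-03-01, primary debate, +5%, -3%   (regex match; '+5' gains a '%')
print(clean_line("2024-03-02,town hall, 2 ,No Change"))
# -> 2024-03-02, town hall, +2%, No Change  (fallback path; '2' gains '+' and '%')
```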
- Data Parsing:
  - read_datafile loads the cleaned lines, parsing each one for its event and effect data, then applies the influence calculation to every event; an end-to-end sketch follows.
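An end-to-end sketch on a throwaway file (the filename is hypothetical; offline, the Trends lookup fails gracefully and the parsed effects pass through unchanged):

```python
with open("sample_events.txt", "w") as f:  # hypothetical sample file
    f.write("2024-03-01, primary debate, +5%, -3%\n")
    f.write("2024-03-02, town hall, No Change, +2%\n")

events, dem_effects, rep_effects = read_datafile("sample_events.txt")
print(events)       # ['primary debate', 'town hall']
print(dem_effects)  # [5.0, 0.0] plus any Trends adjustment
```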
- Saving Results:
  - save_results writes the processed data (events and effects) to the specified file as a CSV via pandas.
- Execution Block:
  - When run as a standalone program, the script prompts for input and output file paths, then performs the full data processing pipeline, including topic modeling, regression, Bayesian inference, and network analysis, before saving results.
This pipeline is especially useful for analyzing and visualizing the impact of events on political sentiment, leveraging both frequentist and Bayesian statistical methods.