Module: analyse

In many ways, the analyse module just encapsulates all the analysis; you don't need to run it by itself at all. I find it more useful to run post with dry-run set to true and look at the results that way.
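For reference, a minimal sketch of what that might look like with a plain ConfigParser. The [post] section and the dry_run option name are assumptions made for illustration; check the post module's documentation for the real names.

from configparser import ConfigParser

# Hypothetical settings: the [post] section and dry_run option name are assumptions.
config = ConfigParser()
config.read_string("""
[post]
dry_run = true
""")
print(config.getboolean('post', 'dry_run'))  # True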


Code Reference

Module for analyzing toots for a hashtag. Reads a JSON dump of toots presumably written by the fetch() function.

analyse(config)

Does a bunch of analysis over the toots and builds a dict with the results suitable for post(); the dict is written out via write_json() rather than returned. The whole process is described in more detail in the methodology documentation.
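A minimal sketch of driving analyse() directly. The import path follows the mastoscore/analyse.py location shown below; the config file name is illustrative, and in the real tool the ConfigParser object comes from the config module.

from configparser import ConfigParser

from mastoscore.analyse import analyse

# In practice the config module builds this object; a plain ConfigParser is used here.
config = ConfigParser()
config.read('mastoscore.ini')  # hypothetical config file name

# Runs the analysis and writes the results out via write_json()
analyse(config)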

Parameters

  • config: A ConfigParser object from the config module

Config Parameters Used

  • analyse:botusername: Exclude toots from this ID
  • analyse:journaldir: Directory to read JSON files from
  • analyse:journalfile: Template for files to read
  • analyse:top_n: How many top toots to report
  • analyse:timezone: What timezone to convert times to
  • analyse:tag_users: Whether we tag users with an @ or not
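A sketch of the corresponding [analyse] options as they might appear when loaded with ConfigParser. The option names come from the list above; the values are made up for illustration.

from configparser import ConfigParser

# Option names as documented above; all values are illustrative.
config = ConfigParser()
config.read_string("""
[analyse]
botusername = scorebot@example.social
journaldir = journal
journalfile = toots
top_n = 5
timezone = Europe/London
tag_users = true
""")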

Returns

Dict that includes a few elements:
  • preamble: A bit of information about the analysis: the hashtag and when the summary was generated.
  • num_toots: A few lines of text that describe the analysis: total number of toots, servers, participants, etc.
  • most_toots: A line about the person who posted the most toots.
  • max_boosts, max_faves, and max_replies: the top_n toot records (DataFrame rows converted to dicts) in each of these categories.
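A sketch of the full structure handed to write_json(); the key names are taken from the source below, while the values are illustrative placeholders.

# Keys as assigned in analyse(); values are illustrative placeholders only.
analysis = {
    'preamble': '<p>Summary of #examplehashtag generated at Sat  1 Mar 2025 18:00 GMT.</p>',
    'num_toots': 'We looked at 321 toots posted between ... and ...',
    'most_toots': "Most toots were from 'Alice' (@alice@example.social) who posted 12",
    'max_boosts': [{'uri': '...', 'reblogs_count': 42}],    # top_n toot records (columns truncated)
    'max_faves': [{'uri': '...', 'favourites_count': 35}],
    'max_replies': [{'uri': '...', 'replies_count': 18}],
    'unique_ids': 87,
    'top_n': 5,
    'hashtag': 'examplehashtag',
    'generated': 'Sat  1 Mar 2025 18:00 GMT',
    'event_start': 'Sat  1 Mar 2025 12:00 GMT',
    'event_end': 'Sat  1 Mar 2025 16:00 GMT',
    'gross_toots': 321,
    'num_servers': 23,
    'max_server': {'name': 'example.social', 'num': 64},
    'most_posts': {'name': 'Alice', 'id': 'alice@example.social', 'count': 12},
}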

Source code in mastoscore/analyse.py
def analyse(config: ConfigParser) -> None:
    """
    Does a bunch of analysis over the toots and builds a dict with the results suitable
    for sending to [post()](module-post.md); the dict is written out via write_json()
    rather than returned. The whole process is described in more detail in
    the [methodology documentation](../methodology.md).

    # Parameters
    - **config**: A ConfigParser object from the [config](module-config.md) module

    # Config Parameters Used

    | Option | Description |
    | ------- | ------- |
    | `analyse:botusername` | Exclude toots from this ID |
    | `analyse:journaldir` | Directory to read JSON files from  |
    | `analyse:journalfile` | Template for files to read  |
    | `analyse:top_n` | How many top toots to report |
    | `analyse:timezone` | What timezone to convert times to |
    | `analyse:tag_users` | Whether we tag users with an @ or not |

    # Returns

    Dict that includes a few elements:
    - `preamble`: A bit of information about the analysis. Hashtag and when it was generated.
    - `num_toots`: A few lines of text that describe the analysis: total number of toots, servers, participants, etc.
    - `most_toots`: A line about the person who posted the most toots.
    - `max_boosts`, `max_faves`, and `max_replies`: the `top_n` toot records (DataFrame rows converted to dicts) in each of these categories.

    """
    hashtag = config.get('analyse', 'hashtag')
    debug = config.getint('analyse', 'debug')
    top_n = config.getint('analyse', 'top_n')
    timezone = config.get('analyse', 'timezone')
    tag_users = config.getboolean('analyse', 'tag_users')

    logger = logging.getLogger(__name__)
    logging.basicConfig(format='%(levelname)s\t%(message)s')
    logger.setLevel(debug)

    df = get_toots_df(config)
    if len(df) <= 0:
        return None

    earliest, latest = get_dates(config)
    analysis = dict()
    # some old data files don't have data for fields we expect
    df = df.replace(nan, None)
    # top poster
    most_toots_id = df['userid'].value_counts().idxmax()
    most_toots_name = df.loc[df['userid'] == most_toots_id][:1]['account.display_name'].values[0]
    most_toots_count = len(df.loc[df['userid'] == most_toots_id])

    # Some overall statistics
    num_servers = df['server'].nunique()
    max_server = df['server'].value_counts().idxmax()
    max_server_toots = len(df.loc[df['server'] == max_server])

    # do the max_boosts stuff last because it is destructive. I remove selected toots
    # from the dataframe so that they can't appear twice. i.e., if you're the most
    # boosted toot, you're taken out of the running for most favourites and most replies,
    # even if you DO have the most favourites and most replies.
    maxdf = df.copy(deep=True)
    max_boosts = maxdf.sort_values(
        by=['reblogs_count', 'favourites_count', 'replies_count'], ascending=False).head(top_n)

    # drop from df all the toots that are in the max_boosts df
    maxdf.drop(maxdf[maxdf['uri'].isin(max_boosts['uri'])].index, inplace=True)
    max_faves = maxdf.sort_values(
        by=['favourites_count', 'reblogs_count', 'replies_count'], ascending=False).head(top_n)

    # drop from df all the toots that are in the max_faves df
    maxdf.drop(maxdf[maxdf['uri'].isin(max_faves['uri'])].index, inplace=True)

    # Discount self-replies when ranking by replies: count how many self-replies each
    # author made (toots where the author replies to their own account) and subtract
    # that count from the reply totals of that author's toots.
    if 'in_reply_to_account_id' in df.columns and 'account.id' in df.columns:
        # Work on a copy of the dataframe for calculating external replies
        replies_df = df.copy()
        logger.debug("removing self-replies")

        # Identify self-replies (where the author replies to their own account)
        self_replies = replies_df[replies_df['in_reply_to_account_id'] == replies_df['account.id']]

        # Count self-replies per author
        self_reply_counts = self_replies.groupby('in_reply_to_account_id').size().reset_index(name='self_reply_count')

        # Merge this count back onto each of the author's toots
        replies_df = replies_df.merge(self_reply_counts, left_on='account.id', right_on='in_reply_to_account_id', how='left')

        # Fill NaN values with 0 (authors with no self-replies)
        replies_df['self_reply_count'] = replies_df['self_reply_count'].fillna(0)

        # Calculate external replies (total replies minus self-replies)
        replies_df['external_replies_count'] = replies_df['replies_count'] - replies_df['self_reply_count']

        # Sort by external replies count instead of total replies
        max_replies = replies_df.sort_values(
            by=['external_replies_count', 'reblogs_count', 'favourites_count'], ascending=False).head(top_n)
        logger.debug(replies_df[['id', 'replies_count', 'self_reply_count', 'external_replies_count']])
    else:
        # Fall back to ranking on raw reply counts if we don't have the necessary columns
        logger.debug("NOT removing self-replies")
        logger.debug(df.columns.tolist())
        max_replies = df.sort_values(
            by=['replies_count', 'reblogs_count', 'favourites_count'], ascending=False).head(top_n)
    # Prepare the analysis
    # convert config strings into datetime structs
    tag = "@" if tag_users else ""
    timezone = pytimezone(timezone)
    start_time = earliest.strftime("%a %e %b %Y %H:%M %Z")
    end_time = latest.strftime("%a %e %b %Y %H:%M %Z")
    right_now = datetime.datetime.now(
        tz=timezone).strftime("%a %e %b %Y %H:%M %Z")
    analysis['preamble'] = f"<p>Summary of #{hashtag} generated at {right_now}.</p>"
    analysis['num_toots'] = f"We looked at {len(df)} toots posted between {start_time} and "\
        f"{end_time} by {df['userid'].nunique()} " +\
        f"different participants across {num_servers} different servers. {max_server} " +\
        f"contributed the most toots at {max_server_toots}"
    analysis['most_toots'] = f"Most toots were from '{most_toots_name}' ({tag}{most_toots_id}) who posted {most_toots_count}"
    analysis['max_boosts'] = max_boosts.to_dict(orient='records')
    analysis['max_faves'] = max_faves.to_dict(orient='records')
    analysis['max_replies'] = max_replies.to_dict(orient='records')
    analysis['unique_ids'] = df['userid'].nunique()
    analysis['top_n'] = top_n
    analysis['hashtag'] = hashtag
    analysis['generated'] = right_now
    analysis['event_start'] = start_time
    analysis['gross_toots'] = len(df)
    analysis['event_end'] = end_time
    analysis['num_servers'] = num_servers
    analysis['max_server'] = {}
    analysis['max_server']['name'] = max_server
    analysis['max_server']['num'] = max_server_toots
    analysis['most_posts'] = {}
    analysis['most_posts']['name'] = most_toots_name
    analysis['most_posts']['id'] = most_toots_id
    analysis['most_posts']['count'] = most_toots_count

    write_json(config, 'analysis', analysis)

get_dates(config)

Given a config, it does a bunch of timezone math and returns a list of two datetime objects, the earliest and latest dates we analyse. The analyse:hours_margin setting is subtracted from the start time and added to the end time. E.g., with hours_margin = 1, start_time = 12:00 and end_time = 14:00, this function returns 11:00 and 15:00 as the two times.

Parameters

  • config: A ConfigParser object from the config module

Config Parameters Used

  • analyse:start_time: start time for the event
  • analyse:end_time: end time for the event
  • analyse:timezone: the time zone to localise all the times to
  • analyse:hours_margin: How many hours of margin to subtract from the start time and add to the end time. You might have fetched toots from days ago, but then want to restrict analysis to toots from around the event (see the example after this list).
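A sketch of the timing options with illustrative values. start_time and end_time are shown in ISO format because the source below parses them with datetime.fromisoformat().

from configparser import ConfigParser

# Illustrative values; start_time and end_time must be parseable by fromisoformat().
config = ConfigParser()
config.read_string("""
[analyse]
start_time = 2025-03-01T12:00:00+00:00
end_time = 2025-03-01T14:00:00+00:00
timezone = Europe/London
hours_margin = 1
""")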

Returns

List of two datetime objects: the earliest possible time and latest possible time
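With those example values, the window works out as below; this is simply a re-creation of the arithmetic in the source that follows.

import datetime

hours_margin = 1
start_time = datetime.datetime.fromisoformat('2025-03-01T12:00:00+00:00')
end_time = datetime.datetime.fromisoformat('2025-03-01T14:00:00+00:00')

earliest = start_time - datetime.timedelta(hours=hours_margin)  # 11:00 UTC
latest = end_time + datetime.timedelta(hours=hours_margin)      # 15:00 UTC

print([earliest, latest])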

Source code in mastoscore/analyse.py
def get_dates(config):
    """
    Given a config, it does a bunch of timezone math and returns a list of
    two datetime objects, the earliest and latest dates we analyse. The
    `analyse:hours_margin` setting is subtracted from the start time and added to
    the end time. E.g., with hours_margin = 1, start_time = 12:00 and end_time = 14:00,
    this function returns 11:00 and 15:00 as the two times.

    # Parameters
    - **config**: A ConfigParser object from the [config](module-config.md) module

    # Config Parameters Used
    - `analyse:start_time`: start time for the event
    - `analyse:end_time`: end time for the event
    - `analyse:timezone`: the time zone to localise all the times to
    - `analyse:hours_margin`: How many hours of margin to subtract from the start time and
       add to the end time. You might have fetched toots from days ago, but then want to
       restrict analysis to toots from around the event.

    # Returns

    List of two `datetime` objects: the earliest possible time and latest possible time

    """
    start_time = config.get('analyse', 'start_time')
    end_time = config.get('analyse', 'end_time')
    timezone = config.get('analyse', 'timezone')
    hours_margin = config.getint('analyse', 'hours_margin')

    timezone = pytimezone(timezone)
    start_time = datetime.datetime.fromisoformat(
        start_time).astimezone(tz=timezone)
    end_time = datetime.datetime.fromisoformat(
        end_time).astimezone(tz=timezone)
    earliest = start_time - datetime.timedelta(hours=hours_margin)
    latest = end_time + datetime.timedelta(hours=hours_margin)

    return [earliest, latest]

get_toots_df(config)

Opens the journal files from a hierarchical directory structure, parses the toots, and filters them down to the set we analyse. Returns a DataFrame with the results. This is its own function because the graph() module calls it.

Parameters

  • config: A ConfigParser object from the config module

Config Parameters Used

  • analyse:botusername: Exclude toots from this ID
  • analyse:journaldir: Base directory to read JSON files from
  • analyse:journalfile: Template for files to read
  • analyse:top_n: How many top toots to report
  • mastoscore:event_year: Year of the event (YYYY)
  • mastoscore:event_month: Month of the event (MM)
  • mastoscore:event_day: Day of the event (DD)

Returns

Pandas DataFrame with all the toots pulled in and converted to normalised types.
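The directory layout these options imply: the function first looks for journalfile-*.json files under journaldir/<event_year>/<event_month>/<event_day>/, and falls back to a recursive search of journaldir if that dated directory is missing. A sketch of the lookup with illustrative values:

import os
from pathlib import Path

# Illustrative values for the options documented above.
journaldir = 'journal'
journalfile = 'toots'
year, month, day = '2025', '03', '01'

# Dated path tried first, e.g. journal/2025/03/01/toots-*.json
search_path = os.path.join(journaldir, year, month, day)
filelist = list(Path(search_path).resolve().glob(f"{journalfile}-*.json"))

# Fallback: recursive search of the base directory
if not filelist:
    filelist = list(Path(journaldir).resolve().glob(f"**/{journalfile}-*.json"))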

Source code in mastoscore/analyse.py
def get_toots_df(config) -> pd.DataFrame:
    """
    Opens the journal files from a hierarchical directory structure, parses the toots,
    and filters them down to the set we analyse. Returns a DataFrame with the results.
    This is its own function because the graph() module calls it.

    # Parameters
    - **config**: A ConfigParser object from the [config](module-config.md) module

    # Config Parameters Used
    - `analyse:botusername`: Exclude toots from this ID
    - `analyse:journaldir`: Base directory to read JSON files from
    - `analyse:journalfile`: Template for files to read
    - `analyse:top_n`: How many top toots to report
    - `mastoscore:event_year`: Year of the event (YYYY)
    - `mastoscore:event_month`: Month of the event (MM)
    - `mastoscore:event_day`: Day of the event (DD)

    # Returns

    Pandas DataFrame with all the toots pulled in and converted to normalised types.
    """
    botusername = config.get('analyse', 'botusername')
    journaldir = config.get('analyse', 'journaldir')
    journalfile = config.get('analyse', 'journalfile')
    debug = config.getint('analyse', 'debug')
    top_n = config.getint('analyse', 'top_n')

    logger = logging.getLogger(__name__)
    logging.basicConfig(format='%(levelname)s\t%(message)s')
    logger.setLevel(debug)

    # Get date components from config
    try:
        year = config.get('mastoscore', 'event_year')
        month = config.get('mastoscore', 'event_month')
        day = config.get('mastoscore', 'event_day')
        date_path = os.path.join(year, month, day)
        logger.info(f"Looking for journal files in date path: {date_path}")
    except Exception as e:
        logger.error(f"Failed to get date components from config: {e}")
        logger.error("Falling back to flat directory structure")
        date_path = ""

    df = pd.DataFrame([])
    # journal is now a template. Read all the matching files into a big data frame
    max_toots = 0
    max_toots_file = "none"
    nfiles = 0

    # Build the path to search for journal files
    if date_path:
        search_path = os.path.join(journaldir, date_path)
        p = Path(search_path).resolve()
        if not p.exists():
            logger.error(f"Directory {search_path} does not exist")
            # Try falling back to the base directory
            logger.info(f"Falling back to base directory: {journaldir}")
            p = Path(journaldir).resolve()
            # Look for files in the hierarchical structure
            pattern = f"**/{journalfile}-*.json"
        else:
            pattern = f"{journalfile}-*.json"
    else:
        p = Path(journaldir).resolve()
        # Look for files in the hierarchical structure
        pattern = f"**/{journalfile}-*.json"

    logger.info(f"Searching for files matching pattern: {pattern} in {p}")
    filelist = list(p.glob(pattern))

    if not filelist:
        logger.warning(f"No files found matching pattern {pattern} in {p}")
        # Try a more general search if specific path failed
        if date_path:
            logger.info("Trying broader search in entire journal directory")
            p = Path(journaldir).resolve()
            filelist = list(p.glob(f"**/{journalfile}-*.json"))

    for jfile in filelist:
        try:
            logger.debug(f"Attempting to read {jfile}")
            newdf = pd.read_json(jfile)
        except Exception as e:
            logger.critical(f"Failed to open {jfile}")
            logger.critical(e)
            continue
        if len(newdf) > max_toots:
            max_toots_file = jfile
            max_toots = len(newdf)
        nfiles = nfiles+1
        df = pd.concat([df, newdf])
        logger.debug(f"Loaded {len(newdf)} toots from {jfile.name}")
        del newdf

    logger.info(f"Loaded {len(df)} total toots from {nfiles} JSON files")
    logger.info(f"Biggest was {max_toots} toots from {max_toots_file}")
    assert(len(df) > 0)
    # Now exclude toots that are too old or too new
    earliest, latest = get_dates(config)
    df = df.loc[df['created_at'] >= earliest]
    df = df.loc[df['created_at'] <= latest]
    # gather up the set we want to work on
    # 1. local toots
    # 2. remote toots where we didn't get a local version
    local_toots = df.loc[df['local'] == True]
    sources = local_toots['source'].unique()
    non_local_toots = df.loc[df['server'] != df['source']]
    # drop all toots from servers we successfully contacted
    non_local_toots = non_local_toots.loc[~non_local_toots['server'].isin(
        sources)]
    # There will be more than one copy of non-local toots.
    # Iterate over each uri, find the copy of it that has the highest numbers
    # and keep it, deleting the others
    non_local_keepers = pd.DataFrame([])
    for uri in non_local_toots['uri'].unique():
        minidf = non_local_toots[non_local_toots['uri'] == uri]
        # logger.debug(f"{len(minidf)} toots for {uri}")
        minidf = minidf.sort_values(
            by=['reblogs_count', 'favourites_count', 'replies_count'], ascending=False).head(1)
        non_local_keepers = pd.concat([non_local_keepers, minidf])
    logger.info(
        f"{len(local_toots)} local toots and {len(non_local_keepers)} non-local toots")
    df = pd.concat([local_toots, non_local_keepers])
    orig_len = len(df)
    df = df[df['userid'] != botusername]
    final_len = len(df)
    logger.debug(
        f"Excluded {orig_len-final_len} posts from ourselves ('{botusername}')")
    # Quick check to make sure we don't have duplicates. Number of rows in the final
    # DataFrame and the number of unique URIs should be the same. If they're not, we
    # have duplicates somewhere.
    num_unique = len(df['uri'].unique())
    if num_unique != final_len:
        logger.error(
            f"We have {final_len} toots, but {num_unique} URIs. Likely duplicates!")
    else:
        logger.debug(
            f"Number of unique URIs ({num_unique}) == Number of rows ({final_len}). All good.")
    return df