Module: analyse
The analyse module encapsulates all the analysis; you rarely need to run it by itself. I find it more useful to run post with dry-run set to true and look at the results that way.
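For example, something along these lines exercises the whole pipeline without actually tooting. This is a hypothetical sketch: the `dry_run` option name, the `mastoscore.post` import path, and the `post(config)` call are assumptions, so check the post and config documentation for the real spelling.

```python
from configparser import ConfigParser
from mastoscore.post import post   # assumed import path

config = ConfigParser()
config.read('mastoscore.ini')            # your own config file
config.set('post', 'dry_run', 'true')    # assumed option name for dry-run
post(config)                             # with dry-run on, just look at the output
```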
Code Reference
Module for analysing toots for a hashtag. Reads a JSON dump of toots presumably written by the fetch() function.
analyse(config)
Does a bunch of analysis over the toots. Returns a dict with the results suitable for sending to post(). The whole process is described in more detail in the methodology documentation.
Parameters
- config: A ConfigParser object from the config module
Config Parameters Used
| Option | Description |
|---|---|
| `analyse:botusername` | Exclude toots from this ID |
| `analyse:journaldir` | Directory to read JSON files from |
| `analyse:journalfile` | Template for files to read |
| `analyse:top_n` | How many top toots to report |
| `analyse:timezone` | What timezone to convert times to |
| `analyse:tag_users` | Whether we tag users with an @ or not |
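For reference, a matching `[analyse]` section might look like the sketch below. The option names come from the table above and from the source code further down this page (`hashtag`, `debug`, `start_time`, `end_time` and `hours_margin` are read there too); all of the values are purely illustrative.

```ini
[analyse]
hashtag = examplehashtag
botusername = scorebot@example.social
journaldir = /var/lib/mastoscore/journal
journalfile = toots
top_n = 5
timezone = Europe/London
tag_users = false
; also read by analyse() and get_dates() in the source below
debug = 20
start_time = 2024-05-01T12:00:00+01:00
end_time = 2024-05-01T14:00:00+01:00
hours_margin = 1
```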
Returns
Dict that includes a few elements:
- preamble: A bit of information about the analysis. Hashtag and when it was generated.
- num_toots: A few lines of text that describe the analysis: total number of toots, servers, participants, etc.
- most_toots: A line about the person that posted the most toots.
- max_boosts, max_faves, and max_replies: lists of records (one dict per toot) containing the top_n toots in each of these categories.
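A minimal usage sketch, assuming a config file like the one above and that the returned dict is the same one analyse() journals to disk via write_json() (the `mastoscore.analyse` import path is an assumption):

```python
from configparser import ConfigParser
from mastoscore.analyse import analyse   # assumed import path

config = ConfigParser()
config.read('mastoscore.ini')
analysis = analyse(config)               # None if no toots were found
if analysis:
    print(analysis['preamble'])
    print(analysis['num_toots'])
    print(analysis['most_toots'])
    print(len(analysis['max_boosts']), "top boosted toots")
```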
Source code in mastoscore/analyse.py
def analyse(config: ConfigParser) -> dict | None:
"""
Does a bunch of analysis over the toots. Returns a dict with the results suitable for
sending to [post()](module-post.md). The whole process is described in more detail in
the [methodology documentation](../methodology.md).
# Parameters
- **config**: A ConfigParser object from the [config](module-config.md) module
# Config Parameters Used
| Option | Description |
| ------- | ------- |
| `analyse:botusername` | Exclude toots from this ID |
| `analyse:journaldir` | Directory to read JSON files from |
| `analyse:journalfile` | Template for files to read |
| `analyse:top_n` | How many top toots to report |
| `analyse:timezone` | What timezone to convert times to |
| `analyse:tag_users` | Whether we tag users with an @ or not |
# Returns
Dict that includes a few elements:
- `preamble`: A bit of information about the analysis. Hashtag and when it was generated.
- `num_toots`: A few lines of text that describe the analysis: total number of toots, servers, participants, etc.
- `most_toots`: A line about the person that posted the most toots.
    - `max_boosts`, `max_faves`, and `max_replies`: lists of records (one dict per toot) containing the `top_n` toots in each of these categories.
"""
hashtag = config.get('analyse', 'hashtag')
debug = config.getint('analyse', 'debug')
top_n = config.getint('analyse', 'top_n')
timezone = config.get('analyse', 'timezone')
tag_users = config.getboolean('analyse', 'tag_users')
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(levelname)s\t%(message)s')
logger.setLevel(debug)
df = get_toots_df(config)
if len(df) <= 0:
return None
earliest, latest = get_dates(config)
analysis = dict()
# some old data files don't have data for fields we expect
df = df.replace(nan, None)
# top poster
most_toots_id = df['userid'].value_counts().idxmax()
most_toots_name = df.loc[df['userid'] == most_toots_id][:1]['account.display_name'].values[0]
most_toots_count = len(df.loc[df['userid'] == most_toots_id])
# Some overall statistics
num_servers = df['server'].nunique()
max_server = df['server'].value_counts().idxmax()
max_server_toots = len(df.loc[df['server'] == max_server])
# do the max_boosts stuff last because it is destructive. I remove selected toots
# from the dataframe so that they can't appear twice. i.e., if you're the most
# boosted toot, you're taken out of the running for most favourites and most replies,
# even if you DO have the most favourites and most replies.
maxdf = df.copy(deep=True)
max_boosts = maxdf.sort_values(
by=['reblogs_count', 'favourites_count', 'replies_count'], ascending=False).head(top_n)
# drop from df all the toots that are in the max_boosts df
maxdf.drop(maxdf[maxdf['uri'].isin(max_boosts['uri'])].index, inplace=True)
max_faves = maxdf.sort_values(
by=['favourites_count', 'reblogs_count', 'replies_count'], ascending=False).head(top_n)
# drop from df all the toots that are in the max_faves df
maxdf.drop(maxdf[maxdf['uri'].isin(max_faves['uri'])].index, inplace=True)
# Count how many replies to each post are from the post's original author
# Group by the original post ID and count replies where the author is the same
    if {'in_reply_to_account_id', 'account.id', 'in_reply_to_id', 'id'}.issubset(df.columns):
# Create a copy of the dataframe to work with for calculating external replies
replies_df = df.copy()
logger.debug(f"removing self-replies")
# Create a new column for external replies (total replies minus self-replies)
# First, identify self-replies (where author replies to their own post)
self_replies = replies_df[replies_df['in_reply_to_account_id'] == replies_df['account.id']]
        # Count self-replies per parent toot (keyed by the parent toot's id)
        self_reply_counts = self_replies.groupby('in_reply_to_id').size().reset_index(name='self_reply_count')
        # Merge this count back onto the main dataframe by matching each toot's own id
        replies_df = replies_df.merge(self_reply_counts, left_on='id', right_on='in_reply_to_id', how='left')
# Fill NaN values with 0 (posts with no self-replies)
replies_df['self_reply_count'] = replies_df['self_reply_count'].fillna(0)
# Calculate external replies (total replies minus self-replies)
replies_df['external_replies_count'] = replies_df['replies_count'] - replies_df['self_reply_count']
# Sort by external replies count instead of total replies
max_replies = replies_df.sort_values(
by=['external_replies_count', 'reblogs_count', 'favourites_count'], ascending=False).head(top_n)
        logger.debug(replies_df[['id', 'replies_count', 'self_reply_count', 'external_replies_count']])
else:
# Fallback to original behavior if we don't have the necessary columns
logger.debug(f"NOT removing self-replies")
logger.debug(df.columns.tolist())
max_replies = df.sort_values(
by=['replies_count', 'reblogs_count', 'favourites_count'], ascending=False).head(top_n)
# Prepare the analysis
# convert config strings into datetime structs
tag = "@" if tag_users else ""
timezone = pytimezone(timezone)
start_time = earliest.strftime("%a %e %b %Y %H:%M %Z")
end_time = latest.strftime("%a %e %b %Y %H:%M %Z")
right_now = datetime.datetime.now(
tz=timezone).strftime("%a %e %b %Y %H:%M %Z")
analysis['preamble'] = f"<p>Summary of #{hashtag} generated at {right_now}.</p>"
analysis['num_toots'] = f"We looked at {len(df)} toots posted between {start_time} and "\
f"{end_time} by {df['userid'].nunique()} " +\
f"different participants across {num_servers} different servers. {max_server} " +\
f"contributed the most toots at {max_server_toots}"
analysis['most_toots'] = f"Most toots were from '{most_toots_name}' ({tag}{most_toots_id}) who posted {most_toots_count}"
analysis['max_boosts'] = max_boosts.to_dict(orient='records', )
analysis['max_faves'] = max_faves.to_dict(orient='records')
analysis['max_replies'] = max_replies.to_dict(orient='records')
analysis['unique_ids'] = df['userid'].nunique()
analysis['top_n'] = top_n
analysis['hashtag'] = hashtag
analysis['generated'] = right_now
analysis['event_start'] = start_time
analysis['gross_toots'] = len(df)
analysis['event_end'] = end_time
analysis['num_servers'] = num_servers
analysis['max_server'] = {}
analysis['max_server']['name'] = max_server
analysis['max_server']['num'] = max_server_toots
analysis['most_posts'] = {}
analysis['most_posts']['name'] = most_toots_name
analysis['most_posts']['id'] = most_toots_id
analysis['most_posts']['count'] = most_toots_count
    write_json(config, 'analysis', analysis)
    return analysis
get_dates(config)
Given a config, it does a bunch of timezone math and returns a list of two datetime objects, the earliest and latest dates we analyse. The configured hours_margin is subtracted from the start time and added to the end time. E.g., with hours_margin = 1, if you set start_time = 12:00 and end_time = 14:00, this function returns 11:00 and 15:00 as the two times.
Parameters
- config: A ConfigParser object from the config module
Config Parameters Used
- `analyse:start_time`: start time for the event
- `analyse:end_time`: end time for the event
- `analyse:timezone`: the time zone to localise all the times to
- `analyse:hours_margin`: how many hours of margin to allow around the event. You might have fetched toots from days ago, but then want to restrict analysis to toots from today.
Returns
List of two datetime objects: the earliest possible time and latest possible time
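A small sketch of the margin arithmetic, using ISO-format times with explicit offsets (the `mastoscore.analyse` import path is an assumption):

```python
from configparser import ConfigParser
from mastoscore.analyse import get_dates   # assumed import path

config = ConfigParser()
config.read_dict({'analyse': {
    'start_time': '2024-05-01T12:00:00+02:00',
    'end_time': '2024-05-01T14:00:00+02:00',
    'timezone': 'Europe/Brussels',
    'hours_margin': '1',
}})
earliest, latest = get_dates(config)
# earliest is 11:00 and latest is 15:00 in the configured timezone
```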
Source code in mastoscore/analyse.py
def get_dates(config):
"""
    Given a config, it does a bunch of timezone math and returns a list of
    two datetime objects, the earliest and latest dates we analyse. The configured
    `analyse:hours_margin` is subtracted from the start and added to the end. E.g., with
    hours_margin = 1, if you set start_time = 12:00 and end_time = 14:00, this function
    returns 11:00 and 15:00 as the two times.
# Parameters
- **config**: A ConfigParser object from the [config](module-config.md) module
# Config Parameters Used
- `analyse:start_time`: start time for the event
- `analyse:end_time`: end time for the event
- `analyse:timezone`: the time zone to localise all the times to
    - `analyse:hours_margin`: how many hours of margin to allow around the event. You might have
      fetched toots from days ago, but then want to restrict analysis to toots from today.
# Returns
List of two `datetime` objects: the earliest possible time and latest possible time
"""
start_time = config.get('analyse', 'start_time')
end_time = config.get('analyse', 'end_time')
timezone = config.get('analyse', 'timezone')
hours_margin = config.getint('analyse', 'hours_margin')
timezone = pytimezone(timezone)
start_time = datetime.datetime.fromisoformat(
start_time).astimezone(tz=timezone)
end_time = datetime.datetime.fromisoformat(
end_time).astimezone(tz=timezone)
earliest = start_time - datetime.timedelta(hours=hours_margin)
latest = end_time + datetime.timedelta(hours=hours_margin)
return [earliest, latest]
get_toots_df(config)
Opens the journal files from a hierarchical directory structure, parses the toots, restricts them to the event window, and de-duplicates toots seen from more than one server. Returns a df with the results. This is its own method because the graph() module calls it.
Parameters
- config: A ConfigParser object from the config module
Config Parameters Used
- `analyse:botusername`: Exclude toots from this ID
- `analyse:journaldir`: Base directory to read JSON files from
- `analyse:journalfile`: Template for files to read
- `analyse:top_n`: How many top toots to report
- `mastoscore:event_year`: Year of the event (YYYY)
- `mastoscore:event_month`: Month of the event (MM)
- `mastoscore:event_day`: Day of the event (DD)
Returns
Pandas DataFrame with all the toots pulled in and converted to normalised types.
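A short sketch of loading the journal yourself, which is essentially what graph() does (paths and file names are illustrative; the import path is an assumption):

```python
from configparser import ConfigParser
from mastoscore.analyse import get_toots_df   # assumed import path

config = ConfigParser()
config.read('mastoscore.ini')
# Journal files are expected under <journaldir>/<YYYY>/<MM>/<DD>/ and
# named '<journalfile>-*.json', as described above.
df = get_toots_df(config)
print(f"{len(df)} toots from {df['userid'].nunique()} accounts "
      f"across {df['server'].nunique()} servers")
```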
Source code in mastoscore/analyse.py
def get_toots_df(config) -> pd.DataFrame:
"""
    Opens the journal files from a hierarchical directory structure, parses the toots,
    restricts them to the event window, and de-duplicates toots seen from more than one
    server. Returns a df with the results. This is its own method because the graph()
    module calls it.
# Parameters
- **config**: A ConfigParser object from the [config](module-config.md) module
# Config Parameters Used
- `analyse:botusername`: Exclude toots from this ID
- `analyse:journaldir`: Base directory to read JSON files from
- `analyse:journalfile`: Template for files to read
- `analyse:top_n`: How many top toots to report
- `mastoscore:event_year`: Year of the event (YYYY)
- `mastoscore:event_month`: Month of the event (MM)
- `mastoscore:event_day`: Day of the event (DD)
# Returns
Pandas DataFrame with all the toots pulled in and converted to normalised types.
"""
botusername = config.get('analyse', 'botusername')
journaldir = config.get('analyse', 'journaldir')
journalfile = config.get('analyse', 'journalfile')
debug = config.getint('analyse', 'debug')
top_n = config.getint('analyse', 'top_n')
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(levelname)s\t%(message)s')
logger.setLevel(debug)
# Get date components from config
try:
year = config.get('mastoscore', 'event_year')
month = config.get('mastoscore', 'event_month')
day = config.get('mastoscore', 'event_day')
date_path = os.path.join(year, month, day)
logger.info(f"Looking for journal files in date path: {date_path}")
except Exception as e:
logger.error(f"Failed to get date components from config: {e}")
logger.error("Falling back to flat directory structure")
date_path = ""
df = pd.DataFrame([])
# journal is now a template. Read all the matching files into a big data frame
max_toots = 0
max_toots_file = "none"
nfiles = 0
# Build the path to search for journal files
if date_path:
search_path = os.path.join(journaldir, date_path)
p = Path(search_path).resolve()
if not p.exists():
logger.error(f"Directory {search_path} does not exist")
# Try falling back to the base directory
logger.info(f"Falling back to base directory: {journaldir}")
p = Path(journaldir).resolve()
# Look for files in the hierarchical structure
pattern = f"**/{journalfile}-*.json"
else:
pattern = f"{journalfile}-*.json"
else:
p = Path(journaldir).resolve()
# Look for files in the hierarchical structure
pattern = f"**/{journalfile}-*.json"
logger.info(f"Searching for files matching pattern: {pattern} in {p}")
filelist = list(p.glob(pattern))
if not filelist:
logger.warning(f"No files found matching pattern {pattern} in {p}")
# Try a more general search if specific path failed
if date_path:
logger.info("Trying broader search in entire journal directory")
p = Path(journaldir).resolve()
filelist = list(p.glob(f"**/{journalfile}-*.json"))
for jfile in filelist:
try:
logger.debug(f"Attempting to read {jfile}")
newdf = pd.read_json(jfile)
except Exception as e:
logger.critical(f"Failed to open {jfile}")
logger.critical(e)
continue
if len(newdf) > max_toots:
max_toots_file = jfile
max_toots = len(newdf)
nfiles = nfiles+1
df = pd.concat([df, newdf])
logger.debug(f"Loaded {len(newdf)} toots from {jfile.name}")
del newdf
logger.info(f"Loaded {len(df)} total toots from {nfiles} JSON files")
logger.info(f"Biggest was {max_toots} toots from {max_toots_file}")
assert(len(df) > 0)
# Now exclude toots that are too old or too new
earliest, latest = get_dates(config)
df = df.loc[df['created_at'] >= earliest]
df = df.loc[df['created_at'] <= latest]
# gather up the set we want to work on
# 1. local toots
# 2. remote toots where we didn't get a local version
local_toots = df.loc[df['local'] == True]
sources = local_toots['source'].unique()
non_local_toots = df.loc[df['server'] != df['source']]
# drop all toots from servers we successfully contacted
non_local_toots = non_local_toots.loc[~non_local_toots['server'].isin(
sources)]
# There will be more than one copy of non-local toots.
# Iterate over each uri, find the copy of it that has the highest numbers
# and keep it, deleting the others
non_local_keepers = pd.DataFrame([])
for uri in non_local_toots['uri'].unique():
minidf = non_local_toots[non_local_toots['uri'] == uri]
# logger.debug(f"{len(minidf)} toots for {uri}")
minidf = minidf.sort_values(
by=['reblogs_count', 'favourites_count', 'replies_count'], ascending=False).head(1)
non_local_keepers = pd.concat([non_local_keepers, minidf])
logger.info(
f"{len(local_toots)} local toots and {len(non_local_keepers)} non-local toots")
df = pd.concat([local_toots, non_local_keepers])
orig_len = len(df)
df = df[df['userid'] != botusername]
final_len = len(df)
logger.debug(
f"Excluded {orig_len-final_len} posts from ourselves ('{botusername}')")
# Quick check to make sure we don't have duplicates. Number of rows in the final
# DataFrame and the number of unique URIs should be the same. If they're not, we
# have duplicates somewhere.
num_unique = len(df['uri'].unique())
if num_unique != final_len:
logger.error(
f"We have {final_len} toots, but {num_unique} URIs. Likely duplicates!")
else:
logger.debug(
f"Number of unique URIs ({num_unique}) == Number of rows ({final_len}). All good.")
return df