# DemiSel / Messenger_1Million_Stats

# ------------------------------------------------
#                    Imports
# ------------------------------------------------

from matplotlib import pyplot as plt
import os
import json
import re


# ------------------------------------------------
#                   Constants
# ------------------------------------------------

DATA_PATH = 'D:/Files/Data/Messenger/'

OTHER_LABEL = 'Les Autres'

# JSON tags
PARTICIPANTS = 'participants'
MESSAGES = 'messages'
NAME = 'name'
CONTENT = 'content'
TIMESTAMP = 'timestamp_ms'
SENDER = 'sender_name'


# ------------------------------------------------
#                   Functions
# ------------------------------------------------

def readBrokenFbJson(datafile_path):
    r"""Load a Facebook JSON export and repair its badly encoded text.

    The export writes each byte of a UTF-8 encoded character as a separate
    ``\u00XX`` escape. Those escapes are turned back into raw bytes so the
    whole document can be decoded as UTF-8 before being parsed.
    See https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded

    Args:
        datafile_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the repaired bytes are not valid UTF-8.
        json.JSONDecodeError: If the repaired text is not valid JSON.
    """
    # Only escapes for bytes 0x80-0xff are rewritten. ASCII escapes such as
    # \u0022 (double quote) or \u000a (newline) are left for json.loads:
    # turning them into raw bytes would produce an invalid JSON document.
    pattern = rb'\\u00([89a-fA-F][\da-fA-F])'

    def to_raw_byte(match):
        # The captured group holds the two hex digits of the byte.
        return bytes.fromhex(match.group(1).decode())

    with open(datafile_path, 'rb') as data_file:
        binary_data = data_file.read()

    repaired = re.sub(pattern, to_raw_byte, binary_data)
    return json.loads(repaired.decode('utf8'))

def computeData():
    """Load every JSON export found in DATA_PATH and merge their contents.

    Returns:
        A ``(participants, messages)`` tuple: the participants as returned
        by cleanParticipants and the messages as returned by cleanMessages,
        both built from the concatenation of all loaded files.

    Raises:
        OSError: If DATA_PATH cannot be listed or a file cannot be read.
        KeyError: If a loaded document lacks the participants or messages
            entry.
    """
    # Every ".json" file in the folder is processed without validation.
    datafiles_path = [
        os.path.join(DATA_PATH, filename)
        for filename in os.listdir(DATA_PATH)
        if filename.endswith('.json')
    ]
    print(datafiles_path)
    messages, participants = [], []

    for datafile_path in datafiles_path:
        datacontent = readBrokenFbJson(datafile_path)
        # A file holding the JSON value `null` parses to None: skip it.
        if datacontent is None:
            continue

        participants += datacontent[PARTICIPANTS]
        messages += datacontent[MESSAGES]

    participants = cleanParticipants(participants)
    messages = cleanMessages(messages)
    return participants, messages

def cleanParticipants(rawParticipants):
    """Return the set of distinct names found in the participant entries.

    Args:
        rawParticipants: Iterable of mappings, each holding a NAME entry.

    Returns:
        A set with one element per distinct name.
    """
    return {entry[NAME] for entry in rawParticipants}

def cleanMessages(rawMessages):
    """Drop messages without content and order the rest by timestamp.

    Args:
        rawMessages: Iterable of message mappings.

    Returns:
        A new list holding only the messages that have a CONTENT entry,
        sorted by ascending TIMESTAMP value.
    """
    with_content = []
    for entry in rawMessages:
        if CONTENT in entry:
            with_content.append(entry)

    with_content.sort(key=lambda entry: entry[TIMESTAMP])
    return with_content

# TODO: try an incremental-search approach.
# Dataset of 14/10/2021: 33679 messages kept for a final count of 34120,
# i.e. an estimated loss of 1.3%.
def filterMessages(messages):
    """Keep only the messages whose content looks like a number.

    A message is kept when its content contains a run of at least two
    digits anywhere, or when it is made of a single digit.

    Args:
        messages: Iterable of message mappings holding a CONTENT entry.

    Returns:
        A list of the matching messages, in their original order.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence. Compiled once instead of being looked up per message.
    number_pattern = re.compile(r'(\d{2,}|^\d$)')
    return [msg for msg in messages if number_pattern.search(msg[CONTENT])]

def computeParticipation(messages):
    """Count how many messages each sender wrote.

    Args:
        messages: Iterable of message mappings holding a SENDER entry.

    Returns:
        A list of ``(sender, count)`` tuples sorted by ascending count.
    """
    counts = {}
    for entry in messages:
        author = entry[SENDER]
        counts[author] = counts.get(author, 0) + 1

    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1])
    return ranking

def mergeSmallParticipation(rawParticipation, threshold=1):
    """Merge the smallest participation entries into a single bucket.

    The entries are expected in ascending count order (the order produced
    by computeParticipation): everything located before the first entry
    whose share reaches the threshold is summed into one bucket labelled
    OTHER_LABEL.

    Args:
        rawParticipation: Sequence of ``(label, count)`` pairs, sorted by
            ascending count.
        threshold: Minimum share of the total, in percent, for an entry to
            keep its own slot.

    Returns:
        A ``(values, labels)`` tuple of two lists of equal length. The
        first item of each list is always the merged bucket (its value is
        0 when nothing was merged), followed by the entries that were kept.
    """
    # Read the argument, not the module-level `participation` variable.
    values = [entry[1] for entry in rawParticipation]
    labels = [entry[0] for entry in rawParticipation]

    totalValues = sum(values)

    # Default: no entry reaches the threshold, so everything is merged.
    # This also covers an empty input and a zero total.
    split = len(values)
    for idx, value in enumerate(values):
        if totalValues and 100 * value / totalValues >= threshold:
            split = idx
            break

    return [sum(values[:split])] + values[split:], [OTHER_LABEL] + labels[split:]

def displayParticipation(participation):
    """Show the participation as a pie chart.

    The entries are first passed through mergeSmallParticipation with its
    default threshold, then drawn with matplotlib.

    Args:
        participation: Sequence of ``(label, count)`` pairs.
    """
    shares, names = mergeSmallParticipation(participation)

    pie_options = {
        'startangle': 90,
        'counterclock': False,
        'labels': names,
        'rotatelabels': True,
    }

    plt.figure(figsize=(8, 7), tight_layout=True)
    plt.pie(shares, **pie_options)
    plt.show()


# ------------------------------------------------
#                   Main Code
# ------------------------------------------------

# Load every export found under DATA_PATH, then keep only the messages
# selected by filterMessages.
participants, messages = computeData()
messages = filterMessages(messages)

# Count the kept messages per sender and show the result as a pie chart.
participation = computeParticipation(messages)
displayParticipation(participation)