# DemiSel / Messenger_1Million_Stats
#
# ------------------------------------------------
#                    Imports
# ------------------------------------------------

from matplotlib import pyplot as plt
import os
import json
import re
import sys, getopt


# ------------------------------------------------
#                   Globals
# ------------------------------------------------

# Folder whose files are listed and loaded by computeData().
DATA_PATH = './data/'

# Label of the merged entry built by mergeSmallParticipation()
# ("Les Autres" is French for "the others").
OTHER_LABEL = 'Les Autres'

# JSON tags
# Keys read from the parsed export files: PARTICIPANTS and MESSAGES at the
# top level of each file, NAME on each participant entry, and CONTENT,
# TIMESTAMP and SENDER on each message entry.
PARTICIPANTS = 'participants'
MESSAGES = 'messages'
NAME = 'name'
CONTENT = 'content'
TIMESTAMP = 'timestamp_ms'
SENDER = 'sender_name'

# Help text printed by handleArguments() for -h/--help and after an
# invalid option. The text itself is user-facing and written in French.
HELP = """General options :
    -h, --help          Consulter l'aide
    --path=<path>       Redéfinir le chemin d'accès aux données (par défaut ./data)
    """

# ------------------------------------------------
#                   Functions
# ------------------------------------------------

def handleArguments(argv):
    """Parse the command-line options and apply them to the module settings.

    Recognised options:
      -h, --help      print the help text and exit
      --path=<path>   replace the module-level DATA_PATH with <path>

    An unrecognised option prints a usage line followed by the help text
    and exits with status 2.

    Args:
        argv: list of raw command-line arguments, without the program name.
    """
    # Without this declaration the assignment below would only bind a local
    # variable and the --path option would be silently ignored.
    global DATA_PATH

    try:
        opts, _ = getopt.getopt(argv, 'h', ['help', 'path='])
    except getopt.GetoptError:
        print('Usage:\n '+os.path.basename(__file__)+' <command> [option]\n')
        print(HELP)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print(HELP)
            sys.exit()
        elif opt == '--path':
            # Plain equality: `opt in ('--path')` was a substring test on a
            # string, because the parentheses do not make a tuple.
            DATA_PATH = arg

def readBrokenFbJson(datafile_path):
    """Load a JSON export file after repairing its mis-encoded text.

    The file is expected to store non-ASCII text as one ``\\u00XX`` escape
    per UTF-8 byte (see the link below). Those escapes are turned back into
    the raw bytes they stand for, then the whole buffer is decoded as UTF-8
    and parsed as JSON.

    Args:
        datafile_path: path of the file to read.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the repaired bytes are not valid UTF-8.
        json.JSONDecodeError: if the repaired text is not valid JSON.
    """
    # Facebook exports are badly encoded, see:
    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
    #
    # First alternative: an escaped backslash. It is matched only to be
    # consumed and kept as is, so a literal "\\u00e9" in a message is not
    # mistaken for an escape.
    # Second alternative: escapes for bytes 0x80-0xff only. ASCII escapes
    # (\u0022, \u005c, control characters...) are left to the JSON parser;
    # emitting them as raw bytes would produce invalid JSON.
    pattern = rb'\\\\|\\u00([89a-fA-F][\da-fA-F])'

    def restoreByte(match):
        hexByte = match.group(1)
        if hexByte is None:
            return match.group(0)
        return bytes.fromhex(hexByte.decode())

    with open(datafile_path, 'rb') as data_file:
        binary_data = data_file.read()

    repaired = re.sub(pattern, restoreByte, binary_data)
    return json.loads(repaired.decode('utf8'))

def computeData():
    """Load every file of DATA_PATH and gather participants and messages.

    Each file is read with readBrokenFbJson(); its PARTICIPANTS and MESSAGES
    entries are accumulated, then passed through cleanParticipants() and
    cleanMessages().

    Returns:
        A (participants, messages) tuple holding the cleaned data.
    """
    # Every entry of the folder is processed, without distinction.
    # os.path.join keeps this working whether or not DATA_PATH ends with a
    # path separator (the help text advertises "./data", without one).
    datafiles_path = [os.path.join(DATA_PATH, filename) for filename in os.listdir(DATA_PATH)]
    messages, participants = [], []

    print(datafiles_path)

    for datafile_path in datafiles_path:
        datacontent = readBrokenFbJson(datafile_path)
        # Defensive guard kept from the original code.
        if datacontent is None:
            continue

        participants.extend(datacontent[PARTICIPANTS])
        messages.extend(datacontent[MESSAGES])

    return cleanParticipants(participants), cleanMessages(messages)

def cleanParticipants(rawParticipants):
    """Return the set of distinct NAME values found in the raw participant entries."""
    return {entry[NAME] for entry in rawParticipants}

def cleanMessages(rawMessages):
    """Drop the messages without a CONTENT entry and sort the rest by ascending TIMESTAMP."""
    withContent = []
    for message in rawMessages:
        if CONTENT in message:
            withContent.append(message)

    # list.sort is stable, like sorted(): equal timestamps keep input order.
    withContent.sort(key=lambda message: message[TIMESTAMP])
    return withContent

# TODO: try an incremental-search approach.
# Dataset of 2021-10-14: 33679 messages kept for a final count of 34120,
# i.e. an estimated loss of 1.3%.
def filterMessages(messages):
    """Keep only the messages whose content looks like it carries a number.

    A message is kept when its CONTENT contains a run of at least two
    digits anywhere, or when it consists of exactly one digit (optionally
    followed by a trailing newline).

    Args:
        messages: iterable of message dicts, each holding a CONTENT entry.

    Returns:
        The list of matching messages, in their original order.
    """
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    # Compiled once, outside the comprehension.
    numberPattern = re.compile(r'(\d{2,}|^\d$)')
    return [msg for msg in messages if numberPattern.search(msg[CONTENT])]

def computeParticipation(messages):
    """Count the messages of each SENDER.

    Returns a list of (sender, count) pairs sorted by ascending count.
    Senders with the same count stay in order of first appearance.
    """
    counts = {}
    for message in messages:
        author = message[SENDER]
        counts[author] = counts.get(author, 0) + 1

    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1])
    return ranking

def mergeSmallParticipation(rawParticipation, threshold = 1):
    """Merge the smallest contributors into a single OTHER_LABEL entry.

    The input is expected to be sorted by ascending count, as produced by
    computeParticipation(). Every leading entry whose share of the total is
    below `threshold` percent is summed into one entry.

    Args:
        rawParticipation: sequence of (label, count) pairs, ascending count.
        threshold: minimum share, in percent, for an entry to stay separate.

    Returns:
        A (values, labels) tuple of lists. The first value/label pair is
        always the merged entry, even when nothing was merged (its value is
        then 0); the remaining pairs are the entries kept separate.
    """
    labels = [entry[0] for entry in rawParticipation]
    values = [entry[1] for entry in rawParticipation]
    totalValues = sum(values)

    # Index of the first entry that reaches the threshold. The default
    # covers the cases where the loop finds none: empty input, a zero total,
    # or every entry below the threshold. Everything is then merged.
    cut = len(values)
    if totalValues > 0:
        for idx, value in enumerate(values):
            if 100 * value / totalValues >= threshold:
                cut = idx
                break

    return [sum(values[:cut])] + values[cut:], [OTHER_LABEL] + labels[cut:]

def displayParticipation(participation):
    """Show the participation as a pie chart.

    The smallest contributors are first merged by mergeSmallParticipation()
    (default threshold); the resulting values and labels are then drawn and
    the figure is displayed.
    """
    shares, names = mergeSmallParticipation(participation)

    pieOptions = {
        'startangle': 90,
        'counterclock': False,
        'labels': names,
        'rotatelabels': True,
    }

    plt.figure(figsize=(8,7), tight_layout = True)
    plt.pie(shares, **pieOptions)
    plt.show()


# ------------------------------------------------
#                   Main Code
# ------------------------------------------------

def main(argv):
    """Run the whole pipeline: parse options, load, filter, count and plot.

    Args:
        argv: command-line arguments, without the program name.
    """
    handleArguments(argv)

    # The participant set returned by computeData() is not used here.
    _, allMessages = computeData()
    numberedMessages = filterMessages(allMessages)

    displayParticipation(computeParticipation(numberedMessages))

if __name__ == "__main__":
    main(sys.argv[1:])