# ------------------------------------------------
# Imports
# ------------------------------------------------

import os
import json
import re

from matplotlib import pyplot as plt


# ------------------------------------------------
# Constants
# ------------------------------------------------

DATA_PATH = 'D:/Files/Data/Messenger/'
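# Assumed layout: the folder holds the Messenger JSON export files of the
# conversation (e.g. message_1.json, message_2.json) copied directly into it.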

OTHER_LABEL = 'Les Autres'

# JSON tags
PARTICIPANTS = 'participants'
MESSAGES = 'messages'
NAME = 'name'
CONTENT = 'content'
TIMESTAMP = 'timestamp_ms'
SENDER = 'sender_name'


# ------------------------------------------------
# Functions
# ------------------------------------------------

def readBrokenFbJson(datafile_path):
    # Facebook's JSON export escapes raw UTF-8 bytes as '\u00XX' sequences,
    # which garbles accented characters when the file is loaded as-is.
    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
    with open(datafile_path, 'rb') as data_file:
        binary_data = data_file.read()
        replace_func = lambda m: bytes.fromhex(m.group(1).decode())
        pattern = rb'\\u00([\da-f]{2})'
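        # Worked example on an assumed sample (not taken from a real export):
        #   input bytes : b'{"content": "\\u00c3\\u00a9"}'
        #   after re.sub: b'{"content": "\xc3\xa9"}'
        #   json.loads  : {'content': 'é'}  (0xC3 0xA9 is the UTF-8 encoding of 'é')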

        repaired = re.sub(pattern, replace_func, binary_data)
        return json.loads(repaired.decode('utf8'))

def computeData():
    # Every JSON file in the folder is processed, without further validation
    datafiles_path = [DATA_PATH + filename for filename in os.listdir(DATA_PATH) if filename.endswith('.json')]
    print(datafiles_path)
    messages, participants = [], []

    for datafile_path in datafiles_path:
        datacontent = readBrokenFbJson(datafile_path)
        if datacontent is None: continue

        participants += datacontent[PARTICIPANTS]
        messages += datacontent[MESSAGES]

    participants = cleanParticipants(participants)
    messages = cleanMessages(messages)
    return participants, messages

def cleanParticipants(rawParticipants):
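    # Deduplicate the participant names collected from every export file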
    return {participant[NAME] for participant in rawParticipants}

def cleanMessages(rawMessages):
    # Keep only messages that carry text content, sorted chronologically
    kept = [message for message in rawMessages if CONTENT in message]
    return sorted(kept, key = lambda x: x[TIMESTAMP])

# TODO: test an incremental-search approach instead
# Dataset from 2021-10-14: 33679 messages kept out of a total of 34120,
# i.e. an estimated loss of 1.3%
def filterMessages(messages):
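    # Keep messages whose text contains two or more consecutive digits, or is a
    # single digit on its own. Assumed samples: "42 !" and "7" are kept, "top 3" is dropped.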
    return [msg for msg in messages if re.search(r'(\d{2,}|^\d$)', msg[CONTENT])]

def computeParticipation(messages):
    result = {}

    for message in messages:
        sender = message[SENDER]
        result[sender] = result.get(sender, 0) + 1

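    # Sorted by ascending message count, e.g. (assumed names) [('Bob', 1), ('Alice', 2)]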
    return sorted(result.items(), key = lambda x: x[1])

def mergeSmallParticipation(rawParticipation, threshold = 1):
    values = [e[1] for e in rawParticipation]
    labels = [e[0] for e in rawParticipation]

    totalValues = sum(values)

    for idx, value in enumerate(values):
        if 100 * value / totalValues >= threshold: break

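    # Every slice below `threshold` percent is merged into a single OTHER_LABEL slice.
    # Assumed example with threshold = 1: values [3, 5, 200, 792] with labels
    # ['A', 'B', 'C', 'D'] become [8, 200, 792] and ['Les Autres', 'C', 'D'].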
    return [sum(values[0:idx])] + values[idx:], [OTHER_LABEL] + labels[idx:]

def displayParticipation(participation):
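    # Pie chart of each sender's share of the messages, smallest slices grouped under OTHER_LABEL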
    values, labels = mergeSmallParticipation(participation)

    plt.figure(figsize=(8,7), tight_layout = True)
    plt.pie(values,
            startangle = 90,
            counterclock = False,
            labels = labels,
            rotatelabels = True)
    plt.show()


# ------------------------------------------------
# Main Code
# ------------------------------------------------

participants, messages = computeData()
messages = filterMessages(messages)

participation = computeParticipation(messages)
displayParticipation(participation)