|
@@ -0,0 +1,115 @@
|
|
1
|
+# ------------------------------------------------
|
|
2
|
+# Imports
|
|
3
|
+# ------------------------------------------------
|
|
4
|
+
|
|
5
|
+from matplotlib import pyplot as plt
|
|
6
|
+import os
|
|
7
|
+import json
|
|
8
|
+import re
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+# ------------------------------------------------
|
|
13
|
+# Constants
|
|
14
|
+# ------------------------------------------------
|
|
15
|
+
|
|
16
|
# Folder that is scanned for the exported JSON conversation files.
DATA_PATH = 'D:/Files/Data/Messenger/'

# Label of the pie slice that groups the smallest participations.
OTHER_LABEL = 'Les Autres'

# JSON tags: top-level keys read from each export file
MESSAGES = 'messages'
PARTICIPANTS = 'participants'

# JSON tags: key read from each participant entry
NAME = 'name'

# JSON tags: keys read from each message entry
SENDER = 'sender_name'
TIMESTAMP = 'timestamp_ms'
CONTENT = 'content'
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+# ------------------------------------------------
|
|
30
|
+# Functions
|
|
31
|
+# ------------------------------------------------
|
|
32
|
+
|
|
33
|
def readBrokenFbJson(datafile_path):
    r"""Load a Facebook JSON export and repair its badly encoded text.

    Per the StackOverflow question linked below, the export writes each
    byte of a UTF-8 encoded character as its own ``\u00XX`` escape, so a
    plain ``json.loads`` yields mojibake. The escapes that stand for a
    non-ASCII byte (0x80-0xFF) are turned back into that raw byte before
    the document is decoded as UTF-8 and parsed.
    https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded

    Args:
        datafile_path: Path of the JSON file to read.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the repaired bytes are not valid UTF-8
            (``UnicodeDecodeError``) or not valid JSON
            (``json.JSONDecodeError``).
    """
    with open(datafile_path, 'rb') as data_file:
        binary_data = data_file.read()

    # Consume every backslash escape as a unit so that an escaped backslash
    # followed by "u00e9" is never mistaken for a \u00e9 escape. Only the
    # \u00XX escapes of high bytes are captured: ASCII ones such as \u0022
    # (a double quote) must stay escaped or the JSON syntax would break.
    pattern = rb'\\(?:u00([89a-fA-F][\da-fA-F])|.)'

    def replace_func(match):
        hex_byte = match.group(1)
        if hex_byte is None:
            # Any other escape sequence is left untouched.
            return match.group(0)
        return bytes.fromhex(hex_byte.decode())

    repaired = re.sub(pattern, replace_func, binary_data, flags=re.DOTALL)
    return json.loads(repaired.decode('utf8'))
|
|
43
|
+
|
|
44
|
def computeData():
    """Load every JSON export found in ``DATA_PATH`` and merge their content.

    Every ``.json`` file located directly in the folder is processed; the
    content of the files is not validated.

    Returns:
        A ``(participants, messages)`` tuple holding the results of
        ``cleanParticipants`` and ``cleanMessages`` applied to the
        concatenated participants and messages of all the files.

    Raises:
        OSError: If ``DATA_PATH`` cannot be listed or a file cannot be read.
        KeyError: If a file lacks the participants or messages key.
    """
    # Test the real ".json" extension (a bare "json" suffix would also accept
    # names such as "xjson"); the listing is sorted so that files are always
    # merged in the same order.
    datafiles_path = [
        os.path.join(DATA_PATH, filename)
        for filename in sorted(os.listdir(DATA_PATH))
        if filename.endswith('.json')
    ]
    print(datafiles_path)
    messages, participants = [], []

    for datafile_path in datafiles_path:
        datacontent = readBrokenFbJson(datafile_path)
        # A file whose whole document is JSON null carries nothing to merge.
        if datacontent is None:
            continue

        participants += datacontent[PARTICIPANTS]
        messages += datacontent[MESSAGES]

    participants = cleanParticipants(participants)
    messages = cleanMessages(messages)
    return participants, messages
|
|
60
|
+
|
|
61
|
def cleanParticipants(rawParticipants):
    """Return the set of distinct names read from the participant entries."""
    return {entry[NAME] for entry in rawParticipants}
|
|
63
|
+
|
|
64
|
def cleanMessages(rawMessages):
    """Drop the messages without content and order the rest by timestamp.

    Returns a new list; messages sharing a timestamp keep their input order.
    """
    with_content = filter(lambda message: CONTENT in message, rawMessages)
    return sorted(with_content, key=lambda message: message[TIMESTAMP])
|
|
67
|
+
|
|
68
|
# TODO: try an incremental-search approach.
# Dataset of 2021-10-14: 33679 messages kept for a final count of 34120,
# i.e. an estimated loss of 1.3%.
def filterMessages(messages):
    """Keep only the messages whose content carries a number.

    A message is kept when its content contains a run of at least two
    digits anywhere, or when the content is a single digit on its own.

    Args:
        messages: Iterable of message dicts that all have a content entry.

    Returns:
        A new list with the matching messages, in their input order.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    # (a SyntaxWarning on recent Python versions). Compiled once up front
    # instead of being looked up for every message.
    number_pattern = re.compile(r'(\d{2,}|^\d$)')
    return [msg for msg in messages if number_pattern.search(msg[CONTENT])]
|
|
73
|
+
|
|
74
|
def computeParticipation(messages):
    """Count the messages sent by each sender.

    Returns a list of ``(sender, count)`` pairs sorted by increasing count;
    senders with the same count stay in order of first appearance.
    """
    counts = {}
    for message in messages:
        author = message[SENDER]
        counts[author] = counts.get(author, 0) + 1

    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1])
    return ranking
|
|
82
|
+
|
|
83
|
def mergeSmallParticipation(rawParticipation, threshold=1):
    """Group the smallest participations under a single "others" entry.

    The entries are expected in increasing order of count (the order
    produced by ``computeParticipation``): the leading entries whose share
    of the total is below ``threshold`` percent are summed into one entry
    labelled ``OTHER_LABEL``.

    Args:
        rawParticipation: Sequence of ``(label, count)`` pairs, sorted by
            increasing count.
        threshold: Minimum share, in percent of the total, for an entry to
            keep its own slot.

    Returns:
        A ``(values, labels)`` pair of parallel lists. The merged entry
        comes first; it is omitted when no entry falls below the threshold.
        Empty input, or a total of zero, is returned unmerged.
    """
    # Read the argument itself rather than a module-level variable, so the
    # function works for any caller.
    values = [count for _, count in rawParticipation]
    labels = [label for label, _ in rawParticipation]

    totalValues = sum(values)
    if totalValues == 0:
        # Nothing to compute a share against (empty input or only zeros).
        return values, labels

    # Index of the first entry reaching the threshold, or len(values) when
    # every entry is below it (then everything is merged).
    cut = next(
        (idx for idx, value in enumerate(values)
         if 100 * value / totalValues >= threshold),
        len(values),
    )
    if cut == 0:
        # No small entry: do not emit an empty "others" slice.
        return values, labels

    return [sum(values[:cut])] + values[cut:], [OTHER_LABEL] + labels[cut:]
|
|
93
|
+
|
|
94
|
def displayParticipation(participation):
    """Show the participation as a pie chart.

    The participation is first passed through ``mergeSmallParticipation``
    (default threshold); the resulting values and labels are drawn on a new
    8x7 figure, and the call blocks on ``plt.show()``.
    """
    shares, names = mergeSmallParticipation(participation)

    # Start at the top of the circle and lay the slices out clockwise.
    pie_options = {
        'startangle': 90,
        'counterclock': False,
        'labels': names,
        'rotatelabels': True,
    }

    plt.figure(figsize=(8, 7), tight_layout=True)
    plt.pie(shares, **pie_options)
    plt.show()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+# ------------------------------------------------
|
|
107
|
+# Main Code
|
|
108
|
+# ------------------------------------------------
|
|
109
|
+
|
|
110
|
# Load and merge every export, then keep only the messages accepted by
# filterMessages.
participants, messages = computeData()
messages = filterMessages(messages)

# Count the remaining messages per sender and chart the result.
participation = computeParticipation(messages)
displayParticipation(participation)
|
|
115
|
+
|