浏览代码

initial

pull/1/head
DemiSel 2 年前
当前提交
ed4d779fa2
共有 1 个文件被更改,包括 115 次插入0 次删除
  1. 115
    0
      export_data-v2.py

+ 115
- 0
export_data-v2.py 查看文件

@@ -0,0 +1,115 @@
1
+# ------------------------------------------------
2
+#                    Imports
3
+# ------------------------------------------------
4
+
5
+from matplotlib import pyplot as plt
6
+import os
7
+import json
8
+import re
9
+
10
+
11
+
12
+# ------------------------------------------------
13
+#                   Constants
14
+# ------------------------------------------------
15
+
16
# Directory that is scanned for the exported conversation JSON files.
DATA_PATH = 'D:/Files/Data/Messenger/'

# Label of the pie slice under which the smallest contributors are grouped
# (French for "the others").
OTHER_LABEL = 'Les Autres'

# Keys read from the exported JSON documents.
PARTICIPANTS = 'participants'   # top-level list of participant objects
MESSAGES = 'messages'           # top-level list of message objects
NAME = 'name'                   # participant display name
CONTENT = 'content'             # text body of a message
TIMESTAMP = 'timestamp_ms'      # message sort key
SENDER = 'sender_name'          # author of a message
27
+
28
+
29
+# ------------------------------------------------
30
+#                   Functions
31
+# ------------------------------------------------
32
+
33
def readBrokenFbJson(datafile_path):
    """Load a Facebook export JSON file and undo its text mis-encoding.

    The export writes every UTF-8 byte of a string as its own ``\\u00XX``
    escape, so a plain JSON parse yields mojibake (one character per byte).
    See https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded

    The document is parsed first and the strings are repaired afterwards.
    Patching the raw bytes before parsing would turn escaped quotes
    (``\\u0022``), backslashes (``\\u005c``) or control characters into raw
    bytes and corrupt the JSON syntax.

    Args:
        datafile_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document with every string (keys included) repaired.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    def _repair(node):
        # Re-interpret each code point <= 0xFF as one byte and decode the
        # byte string as UTF-8. Strings that are not mojibake (they contain
        # code points above 0xFF, or the bytes are not valid UTF-8) are kept
        # unchanged.
        if isinstance(node, str):
            try:
                return node.encode('latin-1').decode('utf-8')
            except (UnicodeEncodeError, UnicodeDecodeError):
                return node
        if isinstance(node, list):
            return [_repair(item) for item in node]
        if isinstance(node, dict):
            return {_repair(key): _repair(value) for key, value in node.items()}
        return node

    with open(datafile_path, 'rb') as data_file:
        return _repair(json.load(data_file))
43
+
44
def computeData():
    """Load every JSON export found in DATA_PATH and aggregate its content.

    Returns:
        A ``(participants, messages)`` tuple: the set of participant names
        and the list of messages that have a text content, sorted by
        timestamp.
    """
    # Every ".json" file of the folder is processed without further validation.
    # The dot is part of the test so that names merely ending in "json"
    # are not picked up.
    datafiles_path = [
        os.path.join(DATA_PATH, filename)
        for filename in os.listdir(DATA_PATH)
        if filename.endswith('.json')
    ]
    print(datafiles_path)
    messages, participants = [], []

    for datafile_path in datafiles_path:
        datacontent = readBrokenFbJson(datafile_path)
        if datacontent is None:
            continue

        participants += datacontent[PARTICIPANTS]
        messages += datacontent[MESSAGES]

    participants = cleanParticipants(participants)
    messages = cleanMessages(messages)
    return participants, messages
60
+
61
def cleanParticipants(rawParticipants):
    """Return the set of distinct names found in the raw participant entries."""
    names = set()
    for entry in rawParticipants:
        names.add(entry[NAME])
    return names
63
+
64
def cleanMessages(rawMessages):
    """Drop the messages without a text content and order the rest by timestamp."""
    with_content = list(filter(lambda entry: CONTENT in entry, rawMessages))
    # list.sort is stable, like sorted(): equal timestamps keep their input order.
    with_content.sort(key=lambda entry: entry[TIMESTAMP])
    return with_content
67
+
68
# TODO: test the incremental-search approach.
# Dataset of 14/10/2021: 33679 messages kept for a final count of 34120,
# i.e. an estimated loss of 1.3%.
def filterMessages(messages):
    """Keep the messages whose content looks like it carries a number.

    A message is kept when its content contains a run of at least two
    digits, or consists of a single digit only.

    Args:
        messages: Message dicts, each holding a CONTENT entry.

    Returns:
        The list of matching messages, in their original order.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    has_number = re.compile(r'(\d{2,}|^\d$)').search
    return [msg for msg in messages if has_number(msg[CONTENT])]
73
+
74
def computeParticipation(messages):
    """Count the messages per sender.

    Returns a list of ``(sender, count)`` pairs ordered by increasing count.
    """
    counts = {}
    for entry in messages:
        author = entry[SENDER]
        counts[author] = counts.get(author, 0) + 1

    ranking = list(counts.items())
    ranking.sort(key=lambda pair: pair[1])
    return ranking
82
+
83
def mergeSmallParticipation(rawParticipation, threshold = 1):
    """Group the smallest contributors into a single OTHER_LABEL bucket.

    Args:
        rawParticipation: Sequence of ``(label, value)`` pairs ordered by
            increasing value.
        threshold: Minimum share of the total, in percent, an entry needs to
            keep its own slice.

    Returns:
        A ``(values, labels)`` tuple of equal-length lists. The first entry
        is always the merged bucket (value 0 when nothing was merged),
        followed by the entries that reach the threshold.
    """
    # Read the argument, not the module-level ``participation`` variable.
    labels = [entry[0] for entry in rawParticipation]
    values = [entry[1] for entry in rawParticipation]

    totalValues = sum(values)

    # Index of the first entry reaching the threshold. The comparison is
    # written without a division so a zero total cannot raise, and the
    # default covers empty input and the case where no entry qualifies.
    cut = next(
        (idx for idx, value in enumerate(values)
         if 100 * value >= threshold * totalValues),
        len(values),
    )

    return [sum(values[:cut])] + values[cut:], [OTHER_LABEL] + labels[cut:]
93
+
94
def displayParticipation(participation):
    """Show a pie chart of the participation, small contributors merged."""
    values, labels = mergeSmallParticipation(participation)

    # Slices start at 12 o'clock and are laid out clockwise.
    pie_options = {
        'startangle': 90,
        'counterclock': False,
        'labels': labels,
        'rotatelabels': True,
    }

    plt.figure(figsize=(8, 7), tight_layout=True)
    plt.pie(values, **pie_options)
    plt.show()
104
+
105
+
106
+# ------------------------------------------------
107
+#                   Main Code
108
+# ------------------------------------------------
109
+
110
# Load the exports, keep the messages carrying a number, then chart the senders.
participants, all_messages = computeData()
messages = filterMessages(all_messages)
participation = computeParticipation(messages)
displayParticipation(participation)
115
+

正在加载...
取消
保存