|
@@ -1,37 +1,43 @@
|
1
|
1
|
|
2
|
|
-import json
|
3
|
|
-import os
|
4
|
|
-import re
|
|
2
|
+import json, os, re
|
5
|
3
|
from typing import List
|
6
|
4
|
|
7
|
5
|
from million.model.fb_export import FacebookExport
|
8
|
|
-from million.model.message import Message
|
9
|
|
-
|
10
|
|
-
|
11
|
|
class FacebookExportParser:
    """Parse a directory of Facebook JSON export files into one export."""

    def parse(self, file_dir) -> "FacebookExport":
        """Read every ``.json`` file in *file_dir* and merge the contents.

        Messages from all files are concatenated and sorted by
        ``timestamp_ms``; participants are concatenated as-is, so duplicates
        across files are kept.

        :param file_dir: directory that contains the export files.
        :return: a ``FacebookExport`` built from the merged data.
        """
        # os.path.join (rather than ``file_dir + f``) keeps the paths valid
        # when the directory is passed without a trailing separator.
        files = [os.path.join(file_dir, f)
                 for f in os.listdir(file_dir) if f.endswith('.json')]
        messages = []
        participants = []
        for file in files:
            with open(file, 'rb') as f:
                json_data = self.__read_broken_fb_json(f.read())
            messages += [Message(**m) for m in json_data['messages']]
            participants += json_data['participants']

        messages.sort(key=lambda m: m.timestamp_ms)
        return FacebookExport(messages=messages, participants=participants)

    def __read_broken_fb_json(self, binary_data):
        """Repair ``\\u00XX`` byte escapes in *binary_data* and parse it as JSON.

        Each ``\\u00XX`` escape whose value is 0x80-0xFF is replaced by the raw
        byte ``XX`` and the result is decoded as UTF-8 (presumably the export
        stores UTF-8 bytes as individual escapes -- confirm against real data).

        :param binary_data: raw bytes of one export file.
        :return: the decoded JSON document.
        """
        repaired = re.sub(
            # Only high bytes are rewritten: turning ASCII-range escapes such
            # as \u0022 or \u000a into raw bytes would corrupt the JSON text,
            # and json.loads already decodes those escapes correctly.
            rb'\\u00([89a-fA-F][\da-fA-F])',
            lambda m: bytes.fromhex(m.group(1).decode()),
            binary_data
        )
        return json.loads(repaired.decode('utf8'))
|
|
6
|
+
|
|
7
|
+
|
|
8
|
def is_file_valid(file_name: str) -> bool:
    """Return True when *file_name* has a ``.json`` extension.

    The comparison ignores case, so ``.JSON`` is accepted as well.
    """
    return os.path.splitext(file_name)[-1].lower() == '.json'


def valid_dirfiles(file_dir: str) -> List[str]:
    """Return the paths of the ``.json`` files located directly in *file_dir*.

    Entries are visited in sorted name order because ``os.listdir`` makes no
    ordering guarantee, and directories that merely carry a ``.json`` suffix
    are skipped since they cannot be opened as export files.

    :param file_dir: directory to scan (not recursive).
    :return: ``file_dir`` joined with each matching file name.
    """
    candidates = (os.path.join(file_dir, file_name)
                  for file_name in sorted(os.listdir(file_dir))
                  if is_file_valid(file_name))
    return [path for path in candidates if os.path.isfile(path)]
|
|
15
|
+
|
|
16
|
def parse_file(file_name: str) -> FacebookExport:
    """Load one export file and build a ``FacebookExport`` from it.

    The file is read as bytes, passed through ``__read_broken_fb_json`` and
    the resulting text is parsed as JSON; the top-level JSON object is
    expanded into keyword arguments of ``FacebookExport``.

    :param file_name: path of the file to read.
    :return: the parsed export, or ``None`` when *file_name* is rejected by
        ``is_file_valid`` (the file is not opened in that case).
    """
    if is_file_valid(file_name):
        with open(file_name, 'rb') as stream:
            raw_bytes = stream.read()
        payload = json.loads(__read_broken_fb_json(raw_bytes))
        return FacebookExport(**payload)
    return None
|
|
23
|
+
|
|
24
|
def parse_dirfiles(file_dir: str) -> FacebookExport:
    """Parse every valid file in *file_dir* and merge them into one export.

    The export parsed from the first file is mutated in place: messages and
    participants of all other exports are appended to it, its messages are
    sorted by ``timestamp_ms`` and its participants are replaced by a set.

    :param file_dir: directory handed to ``valid_dirfiles``.
    :return: the merged export, or ``None`` when no valid file was found.
    """
    parsed = [parse_file(path) for path in valid_dirfiles(file_dir)]
    if not parsed:
        return None

    merged, *remaining = parsed
    for extra in remaining:
        merged.messages.extend(extra.messages)
        merged.participants.extend(extra.participants)

    merged.messages.sort(key=lambda msg: msg.timestamp_ms)
    # NOTE(review): set() needs hashable participants and drops their order;
    # confirm the participant type supports this.
    merged.participants = set(merged.participants)
    return merged
|
|
35
|
+
|
|
36
|
def __read_broken_fb_json(binary_data):
    """Repair ``\\u00XX`` byte escapes in *binary_data* and return the text.

    Each ``\\u00XX`` escape whose value is 0x80-0xFF is replaced by the raw
    byte ``XX`` and the result is decoded as UTF-8 (presumably the export
    stores UTF-8 bytes as individual escapes -- confirm against real data).

    :param binary_data: raw bytes of one export file.
    :return: the repaired document as a ``str``, ready for ``json.loads``.
    :raises UnicodeDecodeError: if the repaired bytes are not valid UTF-8.
    """
    repaired = re.sub(
        # Only high bytes are rewritten: turning ASCII-range escapes such as
        # \u0022 or \u000a into raw bytes would corrupt the JSON text, and the
        # JSON parser already decodes those escapes correctly on its own.
        rb'\\u00([89a-fA-F][\da-fA-F])',
        lambda m: bytes.fromhex(m.group(1).decode()),
        binary_data
    )

    return repaired.decode('utf8')
|