Kaynağa Gözat

Modification en profondeur du parser Facebook

et enrichissement des classes modèle

+ ajout du dossier output/ dans le .gitignore
pull/5/head
Figg 8 ay önce
ebeveyn
işleme
da40ab8b96

+ 2
- 0
.gitignore Dosyayı Görüntüle

@@ -1,5 +1,7 @@
1 1
 *.pyc
2 2
 
3
+output/*
4
+
3 5
 # Packages
4 6
 *.egg
5 7
 !/tests/**/*.egg

+ 16
- 4
million/model/fb_export.py Dosyayı Görüntüle

@@ -1,11 +1,23 @@
1
-
2
-from typing import List
1
+from typing import Any, List
3 2
 from pydantic import BaseModel
4
-from million.model.message import Message
5 3
 
4
+from million.model.message import Message
6 5
 from million.model.participant import Participant
7 6
 
7
+class Image(BaseModel):
8
+    creation_timestamp: int
9
+    uri: str
10
+
11
+class JoinableMode(BaseModel):
12
+    mode: int
13
+    link: str
8 14
 
9 15
 class FacebookExport(BaseModel):
10 16
     messages: List[Message]
11
-    participants: List[Participant]
17
+    participants: List[Participant]
18
+    title: str
19
+    is_still_participant: bool
20
+    thread_path: str
21
+    magic_words: List[Any]
22
+    image: Image
23
+    joinable_mode: JoinableMode

+ 44
- 4
million/model/message.py Dosyayı Görüntüle

@@ -1,14 +1,54 @@
1
-
1
+from datetime import datetime
2 2
 from math import floor
3
-from typing import Optional
3
+from typing import Any, List, Optional
4 4
 from pydantic import BaseModel
5 5
 
6
+class Reaction(BaseModel):
7
+    reaction: str
8
+    actor: str
9
+
10
+class AudioFile(BaseModel):
11
+    uri: str
12
+    creation_timestamp: int
13
+
14
+class Video(BaseModel):
15
+    uri: str
16
+    creation_timestamp: int
17
+
18
+class Photo(BaseModel):
19
+    uri: str
20
+    creation_timestamp: int
21
+
22
+class Gif(BaseModel):
23
+    uri: str
24
+
25
+class Share(BaseModel):
26
+    link: str
27
+    share_text: str
28
+
29
+class Sticker(BaseModel):
30
+    uri: str
31
+    ai_stickers: List[Any]
6 32
 
7 33
 class Message(BaseModel):
8 34
     sender_name: str
9 35
     timestamp_ms: int
10
-    content: Optional[str] = None
11
-    is_geoblocked_for_viewer: Optional[bool] = None
36
+    content: str | None = None
37
+    sticker: Sticker | None = None
38
+    share: Share | None = None
39
+    photos: List[Photo] | None = None
40
+    videos: List[Video] | None = None
41
+    gifs: List[Gif] | None = None
42
+    audio_files: List[AudioFile] | None = None
43
+    call_duration: int | None = None
44
+    reactions: List[Reaction] | None = None
45
+    is_unsent: bool | None = None
46
+    is_geoblocked_for_viewer: bool
47
+
48
+    def __str__(self) -> str:
49
+        dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50
+        dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51
+        return f"{self.sender_name}({dt_str}) : {self.content}"
12 52
 
13 53
     def get_counted_value(self):
14 54
         """

+ 4
- 1
million/model/participant.py Dosyayı Görüntüle

@@ -3,4 +3,7 @@ from pydantic import BaseModel
3 3
 
4 4
 
5 5
 class Participant(BaseModel):
6
-    name: str
6
+    name: str
7
+
8
+    def __hash__(self):
9
+        return hash(self.name)

+ 39
- 33
million/parse/fb_exports.py Dosyayı Görüntüle

@@ -1,37 +1,43 @@
1 1
 
2
-import json
3
-import os
4
-import re
2
+import json, os, re
5 3
 from typing import List
6 4
 
7 5
 from million.model.fb_export import FacebookExport
8
-from million.model.message import Message
9
-
10
-
11
-class FacebookExportParser:
12
-
13
-    def __init__(self):
14
-        pass
15
-
16
-    def parse(self, file_dir) -> FacebookExport:
17
-        files = [file_dir +
18
-                 f for f in os.listdir(file_dir) if f.endswith('.json')]
19
-        messages = []
20
-        participants = []
21
-        for file in files:
22
-            print(file)
23
-            with open(file, 'rb') as f:
24
-                json_data = self.__read_broken_fb_json(f.read())
25
-                messages += [Message(**m) for m in json_data['messages']]
26
-                participants += json_data['participants']
27
-
28
-        messages.sort(key=lambda m: m.timestamp_ms)
29
-        return FacebookExport(messages=messages, participants=participants)
30
-
31
-    def __read_broken_fb_json(self, binary_data):
32
-        repaired = re.sub(
33
-            rb'\\u00([\da-f]{2})',
34
-            lambda m: bytes.fromhex(m.group(1).decode()),
35
-            binary_data
36
-        )
37
-        return json.loads(repaired.decode('utf8'))
6
+
7
+
8
+def is_file_valid(file_name: str) -> bool:
9
+    return os.path.splitext(file_name)[-1].lower() == '.json'
10
+
11
+def valid_dirfiles(file_dir: str) -> List[str]:
12
+    return [os.path.join(file_dir, file_name)
13
+            for file_name in os.listdir(file_dir)
14
+            if is_file_valid(file_name)]
15
+
16
+def parse_file(file_name: str) -> FacebookExport:
17
+    if not is_file_valid(file_name): return None
18
+
19
+    with open(file_name, 'rb') as f:
20
+        fixed_json = __read_broken_fb_json(f.read())
21
+        json_data = json.loads(fixed_json)
22
+        return (FacebookExport(**json_data))
23
+
24
+def parse_dirfiles(file_dir: str) -> FacebookExport:
25
+    exports = [parse_file(f) for f in valid_dirfiles(file_dir)]
26
+    if len(exports) == 0: return
27
+
28
+    for other in exports[1:]:
29
+        exports[0].messages.extend(other.messages)
30
+        exports[0].participants.extend(other.participants)
31
+
32
+    exports[0].messages.sort(key = lambda m: m.timestamp_ms)
33
+    exports[0].participants = set(exports[0].participants)
34
+    return exports[0]
35
+
36
+def __read_broken_fb_json(binary_data):
37
+    repaired = re.sub(
38
+        rb'\\u00([\da-f]{2})',
39
+        lambda m: bytes.fromhex(m.group(1).decode()),
40
+        binary_data
41
+    )
42
+
43
+    return repaired.decode('utf8')

Loading…
İptal
Kaydet