Explorar el Código

Ajout des données JSON facebook messenger

+ les dataclasses et le module de parsing
feature/use_fb_data
Figg hace 4 meses
padre
commit
3e6d1b43e5

+ 61288
- 0
data/fb_export/message_1.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 61437
- 0
data/fb_export/message_2.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 61231
- 0
data/fb_export/message_3.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 61532
- 0
data/fb_export/message_4.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 62184
- 0
data/fb_export/message_5.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 61635
- 0
data/fb_export/message_6.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 61443
- 0
data/fb_export/message_7.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 38016
- 0
data/fb_export/message_8.json
La diferencia del archivo ha sido suprimida porque es demasiado grande
Ver fichero


+ 291
- 0
python/model/facebook_export.py Ver fichero

@@ -0,0 +1,291 @@
1
+from datetime import datetime
2
+from typing import Any, List, Optional, Self
3
+from uuid import uuid4
4
+
5
+class AudioFile:
6
+    uri: str
7
+    creation_timestamp: int
8
+
9
+    def __init__(self, uri: str, creation_timestamp: int):
10
+        self.uri = uri
11
+        self.creation_timestamp = creation_timestamp
12
+
13
+    @classmethod
14
+    def from_dict(cls, data: dict) -> Self:
15
+        return cls(
16
+            uri=data['uri'],
17
+            creation_timestamp=data['creation_timestamp']
18
+        )
19
+    
20
+class Gif:
21
+    uri: str
22
+
23
+    def __init__(self, uri: str):
24
+        self.uri = uri
25
+
26
+    @classmethod
27
+    def from_dict(cls, data: dict) -> Self:
28
+        return cls(
29
+            uri=data['uri']
30
+        )
31
+
32
+class Photo:
33
+    uri: str
34
+    creation_timestamp: int
35
+
36
+    def __init__(self, uri: str, creation_timestamp: int):
37
+        self.uri = uri
38
+        self.creation_timestamp = creation_timestamp
39
+
40
+    @classmethod
41
+    def from_dict(cls, data: dict) -> Self:
42
+        return cls(
43
+            uri=data['uri'],
44
+            creation_timestamp=data['creation_timestamp']
45
+        )
46
+    
47
+class Reaction:
48
+    reaction: str
49
+    actor: str
50
+
51
+    def __init__(self, reaction: str, actor: str):
52
+        self.reaction = reaction
53
+        self.actor = actor
54
+
55
+    @classmethod
56
+    def from_dict(cls, data: dict) -> Self:
57
+        return cls(
58
+            reaction=data['reaction'],
59
+            actor=data['actor']
60
+        )
61
+    
62
+class Share:
63
+    link: str
64
+    share_text: str
65
+
66
+    def __init__(self, link: str, share_text: str):
67
+        self.link = link
68
+        self.share_text = share_text
69
+
70
+    @classmethod
71
+    def from_dict(cls, data: dict) -> Self:
72
+        return cls(
73
+            link=data['link'],
74
+            share_text=data['share_text']
75
+        )
76
+
77
+class Sticker:
78
+    uri: str
79
+    ai_stickers: Optional[List[Any]]
80
+
81
+    def __init__(self, uri: str, ai_stickers: Optional[List[Any]] = None):
82
+        self.uri = uri
83
+        self.ai_stickers = ai_stickers if ai_stickers is not None else []
84
+
85
+    @classmethod
86
+    def from_dict(cls, data: dict) -> Self:
87
+        return cls(
88
+            uri=data['uri'],
89
+            ai_stickers=data.get('ai_stickers', [])
90
+        )
91
+    
92
+class Video:
93
+    uri: str
94
+    creation_timestamp: int
95
+
96
+    def __init__(self, uri: str, creation_timestamp: int):
97
+        self.uri = uri
98
+        self.creation_timestamp = creation_timestamp
99
+
100
+    @classmethod
101
+    def from_dict(cls, data: dict) -> Self:
102
+        return cls(
103
+            uri=data['uri'],
104
+            creation_timestamp=data['creation_timestamp']
105
+        )
106
+  
107
+class Message:
108
+    sender_name: str
109
+    date_time: datetime
110
+    content: Optional[str]
111
+    sticker: Optional[Sticker]
112
+    share: Optional[Share]
113
+    photos: Optional[List[Photo]]
114
+    videos: Optional[List[Video]]
115
+    gifs: Optional[List[Gif]]
116
+    audio_files: Optional[List[AudioFile]]
117
+    call_duration: Optional[int]
118
+    reactions: Optional[List[Reaction]]
119
+    is_unsent: Optional[bool]
120
+    is_geoblocked_for_viewer: bool
121
+    _id: str
122
+
123
+    def __init__(self,
124
+                 sender_name: str,
125
+                 date_time: datetime,
126
+                 is_geoblocked_for_viewer: bool,
127
+                 content: Optional[str] = None,
128
+                 sticker: Optional[Sticker] = None,
129
+                 share: Optional[Share] = None,
130
+                 photos: Optional[List[Photo]] = None,
131
+                 videos: Optional[List[Video]] = None,
132
+                 gifs: Optional[List[Gif]] = None,
133
+                 audio_files: Optional[List[AudioFile]] = None,
134
+                 call_duration: Optional[int] = None,
135
+                 reactions: Optional[List[Reaction]] = None,
136
+                 is_unsent: Optional[bool] = None
137
+                 ):
138
+        self.sender_name = sender_name
139
+        self.date_time = date_time
140
+        self.content = content
141
+        self.sticker = sticker
142
+        self.share = share
143
+        self.photos = photos
144
+        self.videos = videos
145
+        self.gifs = gifs
146
+        self.audio_files = audio_files
147
+        self.call_duration = call_duration
148
+        self.reactions = reactions
149
+        self.is_unsent = is_unsent
150
+        self.is_geoblocked_for_viewer = is_geoblocked_for_viewer
151
+        self._id = str(uuid4())
152
+
153
+    @property
154
+    def message_id(self) -> str:
155
+        return self._id
156
+
157
+    @classmethod
158
+    def from_dict(cls, data: dict) -> Self:
159
+        _timestamp_ms = data.get('timestamp_ms')
160
+        _sender_name = data.get('sender_name')
161
+        _is_geoblocked_for_viewer = data.get('is_geoblocked_for_viewer')
162
+        _content = data.get('content')
163
+        _sticker_dict = data.get('sticker')
164
+        _share_dict = data.get('share')
165
+        _photos_dict = data.get('photos')
166
+        _videos_dict = data.get('videos')
167
+        _gifs_dict = data.get('gifs')
168
+        _audio_files_dict = data.get('audio_files')
169
+        _call_duration = data.get('call_duration')
170
+        _reactions_dict = data.get('reactions')
171
+        _is_unsent = data.get('is_unsent')
172
+
173
+        _date_time = datetime.fromtimestamp(_timestamp_ms / 1000)
174
+        _sticker = Sticker.from_dict(_sticker_dict) if _sticker_dict else None
175
+        _share = Share.from_dict(_share_dict) if _share_dict else None
176
+        _photos = [Photo.from_dict(photo) for photo in _photos_dict] if _photos_dict else None
177
+        _videos = [Video.from_dict(video) for video in _videos_dict] if _videos_dict else None
178
+        _gifs = [Gif.from_dict(gif) for gif in _gifs_dict] if _gifs_dict else None
179
+        _audio_files = [AudioFile.from_dict(audio_file) for audio_file in _audio_files_dict] if _audio_files_dict else None
180
+        _reactions = [Reaction.from_dict(reaction) for reaction in _reactions_dict] if _reactions_dict else None
181
+
182
+        return cls(
183
+            sender_name=_sender_name,
184
+            date_time=_date_time,
185
+            is_geoblocked_for_viewer=_is_geoblocked_for_viewer,
186
+            content=_content,
187
+            sticker=_sticker,
188
+            share=_share,
189
+            photos=_photos,
190
+            videos=_videos,
191
+            gifs=_gifs,
192
+            audio_files=_audio_files,
193
+            call_duration=_call_duration,
194
+            reactions=_reactions,
195
+            is_unsent=_is_unsent
196
+        )
197
+ 
198
+class Image:
199
+    creation_timestamp: int
200
+    uri: str
201
+
202
+    def __init__(self, creation_timestamp: int, uri: str):
203
+        self.creation_timestamp = creation_timestamp
204
+        self.uri = uri
205
+
206
+    @classmethod
207
+    def from_dict(cls, data: dict) -> Self:
208
+        return cls(
209
+            creation_timestamp=data['creation_timestamp'],
210
+            uri=data['uri']
211
+        )
212
+
213
+class JoinableMode:
214
+    mode: int
215
+    link: str
216
+
217
+    def __init__(self, mode: int, link: str):
218
+        self.mode = mode
219
+        self.link = link
220
+
221
+    @classmethod
222
+    def from_dict(cls, data: dict) -> Self:
223
+        return cls(
224
+            mode=data['mode'],
225
+            link=data['link']
226
+        )
227
+
228
+class Participant:
229
+    name: str
230
+
231
+    def __init__(self, name: str) -> None:
232
+        self.name = name
233
+
234
+    @classmethod
235
+    def from_dict(cls, data: dict) -> Self:
236
+        return cls(
237
+            name = data.get('name')
238
+        )
239
+
240
+class FacebookExport:
241
+    messages: List[Message]
242
+    participants: List[Participant]
243
+    title: str
244
+    is_still_participant: bool
245
+    thread_path: str
246
+    magic_words: List[Any]
247
+    image: Image
248
+    joinable_mode: JoinableMode
249
+
250
+    def __init__(self,
251
+                 messages: List[Message],
252
+                 participants: List[Participant],
253
+                 title: str,
254
+                 is_still_participant: bool,
255
+                 thread_path: str,
256
+                 magic_words: List[Any],
257
+                 image: Image,
258
+                 joinable_mode: JoinableMode):
259
+        self.messages = messages
260
+        self.participants = participants
261
+        self.title = title
262
+        self.is_still_participant = is_still_participant
263
+        self.thread_path = thread_path
264
+        self.magic_words = magic_words
265
+        self.image = image
266
+        self.joinable_mode = joinable_mode
267
+
268
+    @classmethod
269
+    def from_dict(cls, data: dict) -> Self:
270
+        messages_data = data.get('messages', [])
271
+        participants_data = data.get('participants', [])
272
+        title = data['title']
273
+        is_still_participant = data['is_still_participant']
274
+        thread_path = data['thread_path']
275
+        magic_words = data.get('magic_words', [])
276
+        image = Image.from_dict(data['image'])
277
+        joinable_mode = JoinableMode.from_dict(data['joinable_mode'])
278
+        
279
+        messages = [Message.from_dict(m) for m in messages_data]
280
+        participants = {Participant.from_dict(p) for p in participants_data}
281
+
282
+        return cls(
283
+            messages=messages,
284
+            participants=participants,
285
+            title=title,
286
+            is_still_participant=is_still_participant,
287
+            thread_path=thread_path,
288
+            magic_words=magic_words,
289
+            image=image,
290
+            joinable_mode=joinable_mode
291
+        )

+ 72
- 0
python/parsing/facebook_export.py Ver fichero

@@ -0,0 +1,72 @@
1
+import json
2
+import os
3
+import re
4
+from typing import Any
5
+from model.facebook_export import FacebookExport
6
+
7
def parse_facebook_exports_from_folder(folder_path: str) -> FacebookExport:
    """
    Read every JSON file of a folder and merge them into a single FacebookExport.

    The first parsed file (in sorted file-name order) provides the base object;
    the messages and participants of each following file are added to it.
    Errors raised by parse_facebook_export propagate to the caller.

    :param folder_path: Path of the folder containing the JSON files
    :return: The merged FacebookExport object
    :raises FileNotFoundError: if the folder does not exist or holds no JSON file
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Le dossier {folder_path} n'existe pas.")

    # Sorted so the merge order does not depend on the arbitrary order of
    # os.listdir(); the extension test is case-insensitive, like the one in
    # parse_facebook_export.
    json_files = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith('.json')
    )

    if not json_files:
        raise FileNotFoundError(f"Aucun fichier JSON trouvé dans le dossier {folder_path}.")

    merged_facebook_export = None

    for file_name in json_files:
        # os.listdir() returns bare names: join them with the folder, otherwise
        # the file is only found when the current directory is that folder.
        file_path = os.path.join(folder_path, file_name)

        facebook_export = parse_facebook_export(file_path)

        if merged_facebook_export is None:
            merged_facebook_export = facebook_export
        else:
            merged_facebook_export.messages.extend(facebook_export.messages)
            merged_facebook_export.participants.update(facebook_export.participants)

    # Defensive guard kept from the original; not expected to trigger since
    # json_files is non-empty and parsing failures raise.
    if merged_facebook_export is None:
        raise ValueError(f"Aucun fichier valide n'a été trouvé ou parsé dans le dossier {folder_path}.")

    return merged_facebook_export
39
+
40
def parse_facebook_export(file_name: str) -> FacebookExport:
    """
    Parse one JSON file into an instance of model.FacebookExport.

    :param file_name: Path of the file to open
    :return: A FacebookExport instance initialised with the data of the JSON file
    :raises ValueError: if the path does not end with ``.json`` or does not exist
    :raises Exception: if reading or converting the file fails; the original
        error is attached as ``__cause__``
    """
    if not file_name.lower().endswith('.json') or not os.path.exists(file_name):
        raise ValueError(f"Le fichier {file_name} n'existe pas ou n'est pas un fichier JSON.")

    try:
        json_data = read_broken_fb_json(file_name)
        return FacebookExport.from_dict(json_data)
    except Exception as e:
        # Same exception type and message as before, now chained so the
        # traceback still shows what actually failed.
        raise Exception(f"Erreur lors de la conversion du JSON en FacebookExport: {str(e)}") from e
55
+
56
def read_broken_fb_json(file_name: str) -> Any:
    """
    Read a JSON file exported by Facebook and return its content after repairing the encoding.

    The repair rewrites each ``\\u00XX`` escape whose value is 0x80-0xFF as the
    raw byte XX, then decodes the whole buffer as UTF-8. Escapes below 0x80
    are left to the JSON parser: turning ``\\u0022``, ``\\u005c`` or a control
    character into a raw byte would corrupt the document.

    :param file_name: Path of the file to open
    :return: The content of the JSON file as Python objects
    :raises UnicodeDecodeError: if the repaired bytes are not valid UTF-8
    :raises json.JSONDecodeError: if the repaired text is not valid JSON
    """
    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
    with open(file_name, 'rb') as f:
        binary_data = f.read()

    def _restore_byte(match: "re.Match[bytes]") -> bytes:
        hex_pair = match.group(1)
        if hex_pair is None:
            # Escaped backslash: consumed as a unit so that a literal
            # backslash followed by "u00e9" in the text is never rewritten.
            return match.group(0)
        return bytes.fromhex(hex_pair.decode())

    repaired = re.sub(
        rb'\\\\|\\u00([89a-fA-F][\da-fA-F])',
        _restore_byte,
        binary_data
    )

    return json.loads(repaired.decode('utf8'))

Loading…
Cancelar
Guardar