Browse Source

fix mineurs

Modification des scripts en fonction du nouveau parser
Ajout d'une classe servant à évaluer la valeur comptée d'un message
pull/5/head
Figg 6 months ago
parent
commit
d92dd386a4

+ 4
- 3
million/analyze/find_holes.py View File

@@ -4,19 +4,20 @@ from typing import List
4 4
 from million.model.hole import Hole
5 5
 from million.model.message import Message
6 6
 from million.model.sequence import Sequence
7
+import million.analyze.message_evaluation as msg_ev
7 8
 
8 9
 
9 10
 def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
10 11
     sequences: List[Sequence] = []
11 12
     current_sequence = Sequence(
12
-        start=messages[0].get_counted_value(),
13
+        start = msg_ev.compute(messages[0]),
13 14
         start_message=messages[0],
14
-        end=messages[0].get_counted_value(),
15
+        end = msg_ev.compute(messages[0]),
15 16
         end_message=messages[0]
16 17
     )
17 18
     for i in range(1, len(messages)):
18 19
         message = messages[i]
19
-        message_value = message.get_counted_value()
20
+        message_value = msg_ev.compute(message)
20 21
         if message_value > accepted_max:
21 22
             continue
22 23
         if message_value - current_sequence.end == 1:

+ 18
- 0
million/analyze/message_evaluation.py View File

@@ -0,0 +1,18 @@
1
+from math import floor
2
+from million.model.message import Message
3
+
4
+# TODO WIP
5
+# - DNS to resolve audio, gif, pictures with counts
6
+def compute(msg: Message) -> int:
7
+    """ Returns the estimated value counted in this message
8
+    """
9
+    value = None
10
+    # Remove any number that is not a digit
11
+    # TODO parse potential math expressions in content
12
+    cleaned_content = ''.join([c for c in msg.content if c.isdigit()])
13
+    try:
14
+        value = floor(float(cleaned_content))
15
+    except Exception as e:
16
+        raise ValueError(
17
+            f"Message {cleaned_content} does not contain a number ({e})")
18
+    return value

+ 2
- 3
million/analyze/retain_counts.py View File

@@ -6,11 +6,10 @@ from million.model.message import Message
6 6
 
7 7
 def retain_counts(messages : List[Message])-> List[Message]:
8 8
     """
9
-    Retain only the messages that have a content and a sender_name
9
+    Retain only the messages that have a content
10 10
     """
11 11
     return [
12 12
         m for m in messages 
13
-        if m.content and 
14
-        m.sender_name and
13
+        if m.content and
15 14
         re.search('(\d{2,}|^\d$)', m.content)
16 15
         ]

+ 23
- 3
million/model/fb_export.py View File

@@ -1,4 +1,6 @@
1
-from typing import Any, List
1
+from __future__ import annotations
2
+
3
+from typing import Any, List, Set
2 4
 from pydantic import BaseModel
3 5
 
4 6
 from million.model.message import Message
@@ -8,16 +10,34 @@ class Image(BaseModel):
8 10
     creation_timestamp: int
9 11
     uri: str
10 12
 
13
+    def __eq__(self, other: Image) -> bool:
14
+        return self.creation_timestamp == other.creation_timestamp \
15
+            and self.uri == other.uri
16
+
11 17
 class JoinableMode(BaseModel):
12 18
     mode: int
13 19
     link: str
14 20
 
15 21
 class FacebookExport(BaseModel):
16 22
     messages: List[Message]
17
-    participants: List[Participant]
23
+    participants: Set[Participant]
18 24
     title: str
19 25
     is_still_participant: bool
20 26
     thread_path: str
21
-    magic_words: List[Any]
27
+    magic_words: Set[Any]
22 28
     image: Image
23 29
     joinable_mode: JoinableMode
30
+
31
+    def merge(self, other: FacebookExport) -> None:
32
+        if self == other:
33
+            self.messages.extend(other.messages)
34
+            self.participants.update(other.participants)
35
+            self.magic_words.update(other.magic_words)
36
+
37
+    def sort(self) -> None:
38
+        self.messages.sort(key = lambda m: m.timestamp_ms)
39
+
40
+    # NOTE Toughen equality conditions ?
41
+    def __eq__(self, other: FacebookExport) -> bool:
42
+        return self.title == other.title \
43
+            and self.image == other.image

+ 1
- 16
million/model/message.py View File

@@ -1,6 +1,6 @@
1 1
 from datetime import datetime
2 2
 from math import floor
3
-from typing import Any, List, Optional
3
+from typing import Any, List
4 4
 from pydantic import BaseModel
5 5
 
6 6
 class Reaction(BaseModel):
@@ -49,18 +49,3 @@ class Message(BaseModel):
49 49
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50 50
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51 51
         return f"{self.sender_name}({dt_str}) : {self.content}"
52
-
53
-    def get_counted_value(self):
54
-        """
55
-        The content of the message should be (or contain) a number
56
-        """
57
-        value = None
58
-        # Remove any number that is not a digit
59
-        # TODO parse potential math expressions in content
60
-        cleaned_content = ''.join([c for c in self.content if c.isdigit()])
61
-        try:
62
-            value = floor(float(cleaned_content))
63
-        except Exception as e:
64
-            raise ValueError(
65
-                f"Message {cleaned_content} does not contain a number ({e})")
66
-        return value

+ 12
- 11
million/parse/fb_exports.py View File

@@ -1,11 +1,13 @@
1 1
 
2
-import json, os, re
2
+import os, re
3 3
 from typing import List
4 4
 
5 5
 from million.model.fb_export import FacebookExport
6 6
 
7 7
 
8 8
 def is_file_valid(file_name: str) -> bool:
9
+    # NOTE is there a way to peek inside a json file to
10
+    # check its internal structure ?
9 11
     return os.path.splitext(file_name)[-1].lower() == '.json'
10 12
 
11 13
 def valid_dirfiles(file_dir: str) -> List[str]:
@@ -17,23 +19,22 @@ def parse_file(file_name: str) -> FacebookExport:
17 19
     if not is_file_valid(file_name): return None
18 20
 
19 21
     with open(file_name, 'rb') as f:
20
-        fixed_json = __read_broken_fb_json(f.read())
21
-        json_data = json.loads(fixed_json)
22
-        return (FacebookExport(**json_data))
22
+        json_data = __read_broken_fb_json(f.read())
23
+        return FacebookExport.model_validate_json(json_data)
23 24
 
24 25
 def parse_dirfiles(file_dir: str) -> FacebookExport:
25 26
     exports = [parse_file(f) for f in valid_dirfiles(file_dir)]
26
-    if len(exports) == 0: return
27
+    
28
+    result = exports[0]
27 29
 
28
-    for other in exports[1:]:
29
-        exports[0].messages.extend(other.messages)
30
-        exports[0].participants.extend(other.participants)
30
+    for ex in exports[1:]: 
31
+        result.merge(ex)
31 32
 
32
-    exports[0].messages.sort(key = lambda m: m.timestamp_ms)
33
-    exports[0].participants = set(exports[0].participants)
34
-    return exports[0]
33
+    result.sort()
34
+    return result
35 35
 
36 36
 def __read_broken_fb_json(binary_data):
37
+    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
37 38
     repaired = re.sub(
38 39
         rb'\\u00([\da-f]{2})',
39 40
         lambda m: bytes.fromhex(m.group(1).decode()),

+ 3
- 5
scripts/find_holes.py View File

@@ -3,14 +3,12 @@ from million.analyze.find_holes import compute_sequences, find_holes
3 3
 from million.view.bar_chart import plot as bar_chart
4 4
 from million.analyze.count_participations import count_participations
5 5
 from million.analyze.retain_counts import retain_counts
6
-from million.parse.fb_exports import FacebookExportParser
6
+import million.parse.fb_exports as fb
7 7
 
8 8
 
9 9
 DATA_PATH = './data/'
10 10
 
11
-parser = FacebookExportParser()
12
-
13
-export = parser.parse(DATA_PATH)
11
+export = fb.parse_dirfiles(DATA_PATH)
14 12
 
15 13
 filtered = retain_counts(export.messages)
16 14
 
@@ -29,7 +27,7 @@ for hole in holes:
29 27
 
30 28
 
31 29
 # lets export a csv file of the holes and the people responsible for them
32
-with open('holes.csv', 'w') as f:
30
+with open('output/holes.csv', 'w') as f:
33 31
     f.write('début,fin,taille,responsable1,responsable2,date1,date2\n')
34 32
     for hole in holes:
35 33
         date_start = datetime.utcfromtimestamp(

+ 2
- 4
scripts/read_top.py View File

@@ -1,14 +1,12 @@
1 1
 from million.view.bar_chart import plot as bar_chart
2 2
 from million.analyze.count_participations import count_participations
3 3
 from million.analyze.retain_counts import retain_counts
4
-from million.parse.fb_exports import FacebookExportParser
4
+import million.parse.fb_exports as fb
5 5
 
6 6
 
7 7
 DATA_PATH = './data/'
8 8
 
9
-parser = FacebookExportParser()
10
-
11
-export = parser.parse(DATA_PATH)
9
+export = fb.parse_dirfiles(DATA_PATH)
12 10
 
13 11
 filtered = retain_counts(export.messages)
14 12
 

Loading…
Cancel
Save