Minor fixes

Updated the scripts to work with the new parser
Added a class for evaluating the value counted in a message

pull/5/head
Figg, 8 months ago
commit d92dd386a4

million/analyze/find_holes.py  +4 -3

 from million.model.hole import Hole
 from million.model.message import Message
 from million.model.sequence import Sequence
+import million.analyze.message_evaluation as msg_ev


 def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
     sequences: List[Sequence] = []
     current_sequence = Sequence(
-        start=messages[0].get_counted_value(),
+        start = msg_ev.compute(messages[0]),
         start_message=messages[0],
-        end=messages[0].get_counted_value(),
+        end = msg_ev.compute(messages[0]),
         end_message=messages[0]
     )
     for i in range(1, len(messages)):
         message = messages[i]
-        message_value = message.get_counted_value()
+        message_value = msg_ev.compute(message)
         if message_value > accepted_max:
             continue
         if message_value - current_sequence.end == 1:

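For context, a minimal sketch of how the updated compute_sequences is meant to be driven, mirroring scripts/find_holes.py further down (the ./data/ path is the repository convention and is assumed to hold the Facebook JSON export):

import million.parse.fb_exports as fb
from million.analyze.retain_counts import retain_counts
from million.analyze.find_holes import compute_sequences

export = fb.parse_dirfiles('./data/')     # parse and merge every JSON export found
counts = retain_counts(export.messages)   # keep only the messages that look like counts
sequences = compute_sequences(counts)     # group consecutive counted values
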
million/analyze/message_evaluation.py  +18 -0  (new file)

+from math import floor
+from million.model.message import Message
+
+# TODO WIP
+# - DNS to resolve audio, gif, pictures with counts
+def compute(msg: Message) -> int:
+    """ Returns the estimated value counted in this message
+    """
+    value = None
+    # Remove any character that is not a digit
+    # TODO parse potential math expressions in content
+    cleaned_content = ''.join([c for c in msg.content if c.isdigit()])
+    try:
+        value = floor(float(cleaned_content))
+    except Exception as e:
+        raise ValueError(
+            f"Message {cleaned_content} does not contain a number ({e})")
+    return value

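The core of compute() is the digit-stripping step; here is a standalone sketch of the same logic on plain strings (kept independent of the Message model, whose full field list is not shown in this diff; the sample inputs are invented):

from math import floor

def value_of(content: str) -> int:
    # keep only digit characters, exactly as compute() does with msg.content
    cleaned = ''.join(c for c in content if c.isdigit())
    return floor(float(cleaned))

print(value_of("3050 !"))            # 3050
print(value_of("on to 1 000 001"))   # 1000001 -- spaces dropped, digits concatenated

Note that any stray digits elsewhere in a message would be concatenated into the value as well.
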
million/analyze/retain_counts.py  +2 -3

 
 def retain_counts(messages : List[Message])-> List[Message]:
     """
-    Retain only the messages that have a content and a sender_name
+    Retain only the messages that have a content
     """
     return [
         m for m in messages 
-        if m.content and 
-        m.sender_name and
+        if m.content and
         re.search('(\d{2,}|^\d$)', m.content)
         ]

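The regex in retain_counts keeps a message when its content holds a run of at least two digits anywhere, or is exactly one digit; a quick self-contained check (sample strings invented):

import re

PATTERN = r'(\d{2,}|^\d$)'

for content in ["42", "7", "up to 1523 now", "7 am tomorrow", "no numbers here"]:
    print(content, '->', bool(re.search(PATTERN, content)))
# "42", "7" and "up to 1523 now" are kept; "7 am tomorrow" and "no numbers here" are not
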
million/model/fb_export.py  +23 -3

-from typing import Any, List
+from __future__ import annotations
+
+from typing import Any, List, Set
 from pydantic import BaseModel
 
 from million.model.message import Message
…
     creation_timestamp: int
     uri: str
 
+    def __eq__(self, other: Image) -> bool:
+        return self.creation_timestamp == other.creation_timestamp \
+            and self.uri == other.uri
+
 class JoinableMode(BaseModel):
     mode: int
     link: str
 
 class FacebookExport(BaseModel):
     messages: List[Message]
-    participants: List[Participant]
+    participants: Set[Participant]
     title: str
     is_still_participant: bool
     thread_path: str
-    magic_words: List[Any]
+    magic_words: Set[Any]
     image: Image
     joinable_mode: JoinableMode
+
+    def merge(self, other: FacebookExport) -> None:
+        if self == other:
+            self.messages.extend(other.messages)
+            self.participants.update(other.participants)
+            self.magic_words.update(other.magic_words)
+
+    def sort(self) -> None:
+        self.messages.sort(key = lambda m: m.timestamp_ms)
+
+    # NOTE Toughen equality conditions ?
+    def __eq__(self, other: FacebookExport) -> bool:
+        return self.title == other.title \
+            and self.image == other.image

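A plausible usage of the new merge and sort helpers on two chunks of the same conversation (file paths are placeholders, not from the commit; merge silently does nothing when the exports do not compare equal, i.e. when title or image differ):

import million.parse.fb_exports as fb

a = fb.parse_file('./data/message_1.json')   # placeholder paths
b = fb.parse_file('./data/message_2.json')

a.merge(b)   # extends messages, participants and magic_words when a == b
a.sort()     # reorders the combined messages by timestamp_ms

Comparing exports only on title and image is deliberately loose (hence the NOTE above), presumably so that the split JSON files of a single conversation are treated as the same export.
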
million/model/message.py  +1 -16

 from datetime import datetime
 from math import floor
-from typing import Any, List, Optional
+from typing import Any, List
 from pydantic import BaseModel
 
 class Reaction(BaseModel):
…
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
         return f"{self.sender_name}({dt_str}) : {self.content}"
-
-    def get_counted_value(self):
-        """
-        The content of the message should be (or contain) a number
-        """
-        value = None
-        # Remove any number that is not a digit
-        # TODO parse potential math expressions in content
-        cleaned_content = ''.join([c for c in self.content if c.isdigit()])
-        try:
-            value = floor(float(cleaned_content))
-        except Exception as e:
-            raise ValueError(
-                f"Message {cleaned_content} does not contain a number ({e})")
-        return value

million/parse/fb_exports.py  +12 -11

 
-import json, os, re
+import os, re
 from typing import List
 
 from million.model.fb_export import FacebookExport
 
 
 def is_file_valid(file_name: str) -> bool:
+    # NOTE is there a way to peek inside a json file to
+    # check its internal structure ?
     return os.path.splitext(file_name)[-1].lower() == '.json'
 
 def valid_dirfiles(file_dir: str) -> List[str]:
…
     if not is_file_valid(file_name): return None
 
     with open(file_name, 'rb') as f:
-        fixed_json = __read_broken_fb_json(f.read())
-        json_data = json.loads(fixed_json)
-        return (FacebookExport(**json_data))
+        json_data = __read_broken_fb_json(f.read())
+        return FacebookExport.model_validate_json(json_data)
 
 def parse_dirfiles(file_dir: str) -> FacebookExport:
     exports = [parse_file(f) for f in valid_dirfiles(file_dir)]
-    if len(exports) == 0: return
+
+    result = exports[0]
 
-    for other in exports[1:]:
-        exports[0].messages.extend(other.messages)
-        exports[0].participants.extend(other.participants)
+    for ex in exports[1:]:
+        result.merge(ex)
 
-    exports[0].messages.sort(key = lambda m: m.timestamp_ms)
-    exports[0].participants = set(exports[0].participants)
-    return exports[0]
+    result.sort()
+    return result
 
 def __read_broken_fb_json(binary_data):
+    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
     repaired = re.sub(
         rb'\\u00([\da-f]{2})',
         lambda m: bytes.fromhex(m.group(1).decode()),

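For reference, the substitution in __read_broken_fb_json undoes Facebook's double-encoded UTF-8 escapes (see the Stack Overflow link added in the diff); a self-contained illustration with a made-up byte string:

import re

raw = rb'{"content": "D\u00c3\u00a9but"}'   # how the export encodes "Début"
repaired = re.sub(
    rb'\\u00([\da-f]{2})',
    lambda m: bytes.fromhex(m.group(1).decode()),
    raw)
print(repaired.decode('utf8'))   # {"content": "Début"}
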
scripts/find_holes.py  +3 -5

 from million.view.bar_chart import plot as bar_chart
 from million.analyze.count_participations import count_participations
 from million.analyze.retain_counts import retain_counts
-from million.parse.fb_exports import FacebookExportParser
+import million.parse.fb_exports as fb
 
 
 DATA_PATH = './data/'
 
-parser = FacebookExportParser()
-
-export = parser.parse(DATA_PATH)
+export = fb.parse_dirfiles(DATA_PATH)
 
 filtered = retain_counts(export.messages)
 
…
 
 
 # lets export a csv file of the holes and the people responsible for them
-with open('holes.csv', 'w') as f:
+with open('output/holes.csv', 'w') as f:
     f.write('début,fin,taille,responsable1,responsable2,date1,date2\n')
     for hole in holes:
         date_start = datetime.utcfromtimestamp(

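Since the CSV now lands in output/, that directory must exist before the script writes to it; a small guard (not part of the commit) would be:

import os

os.makedirs('output', exist_ok=True)   # create the output folder if it is missing

with open('output/holes.csv', 'w') as f:
    ...
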
scripts/read_top.py  +2 -4

 from million.view.bar_chart import plot as bar_chart
 from million.analyze.count_participations import count_participations
 from million.analyze.retain_counts import retain_counts
-from million.parse.fb_exports import FacebookExportParser
+import million.parse.fb_exports as fb
 
 
 DATA_PATH = './data/'
 
-parser = FacebookExportParser()
-
-export = parser.parse(DATA_PATH)
+export = fb.parse_dirfiles(DATA_PATH)
 
 filtered = retain_counts(export.messages)
 
