Browse Source

fix mineurs

Modification des scripts en fonction du nouveau parser
Ajout d'une classe servant à évaluer la valeur comptée d'un message
pull/5/head
Figg 6 months ago
parent
commit
d92dd386a4

+ 4
- 3
million/analyze/find_holes.py View File

@@ -4,19 +4,20 @@ from typing import List
4 4
 from million.model.hole import Hole
5 5
 from million.model.message import Message
6 6
 from million.model.sequence import Sequence
7
+import million.analyze.message_evaluation as msg_ev
7 8
 
8 9
 
9 10
 def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) -> List[Sequence]:
10 11
     sequences: List[Sequence] = []
11 12
     current_sequence = Sequence(
12
-        start=messages[0].get_counted_value(),
13
+        start = msg_ev.compute(messages[0]),
13 14
         start_message=messages[0],
14
-        end=messages[0].get_counted_value(),
15
+        end = msg_ev.compute(messages[0]),
15 16
         end_message=messages[0]
16 17
     )
17 18
     for i in range(1, len(messages)):
18 19
         message = messages[i]
19
-        message_value = message.get_counted_value()
20
+        message_value = msg_ev.compute(message)
20 21
         if message_value > accepted_max:
21 22
             continue
22 23
         if message_value - current_sequence.end == 1:

+ 18
- 0
million/analyze/message_evaluation.py View File

@@ -0,0 +1,18 @@
1
+from math import floor
2
+from million.model.message import Message
3
+
4
+# TODO WIP
5
+# - DNS to resolve audio, gif, pictures with counts
6
+def compute(msg: Message) -> int:
7
+    """ Returns the estimated value counted in this message
8
+    """
9
+    value = None
10
+    # Remove any number that is not a digit
11
+    # TODO parse potential math expressions in content
12
+    cleaned_content = ''.join([c for c in msg.content if c.isdigit()])
13
+    try:
14
+        value = floor(float(cleaned_content))
15
+    except Exception as e:
16
+        raise ValueError(
17
+            f"Message {cleaned_content} does not contain a number ({e})")
18
+    return value

+ 2
- 3
million/analyze/retain_counts.py View File

@@ -6,11 +6,10 @@ from million.model.message import Message
6 6
 
7 7
 def retain_counts(messages : List[Message])-> List[Message]:
8 8
     """
9
-    Retain only the messages that have a content and a sender_name
9
+    Retain only the messages that have a content
10 10
     """
11 11
     return [
12 12
         m for m in messages 
13
-        if m.content and 
14
-        m.sender_name and
13
+        if m.content and
15 14
         re.search('(\d{2,}|^\d$)', m.content)
16 15
         ]

+ 23
- 3
million/model/fb_export.py View File

@@ -1,4 +1,6 @@
1
-from typing import Any, List
1
+from __future__ import annotations
2
+
3
+from typing import Any, List, Set
2 4
 from pydantic import BaseModel
3 5
 
4 6
 from million.model.message import Message
@@ -8,16 +10,34 @@ class Image(BaseModel):
8 10
     creation_timestamp: int
9 11
     uri: str
10 12
 
13
+    def __eq__(self, other: Image) -> bool:
14
+        return self.creation_timestamp == other.creation_timestamp \
15
+            and self.uri == other.uri
16
+
11 17
 class JoinableMode(BaseModel):
12 18
     mode: int
13 19
     link: str
14 20
 
15 21
 class FacebookExport(BaseModel):
16 22
     messages: List[Message]
17
-    participants: List[Participant]
23
+    participants: Set[Participant]
18 24
     title: str
19 25
     is_still_participant: bool
20 26
     thread_path: str
21
-    magic_words: List[Any]
27
+    magic_words: Set[Any]
22 28
     image: Image
23 29
     joinable_mode: JoinableMode
30
+
31
+    def merge(self, other: FacebookExport) -> None:
32
+        if self == other:
33
+            self.messages.extend(other.messages)
34
+            self.participants.update(other.participants)
35
+            self.magic_words.update(other.magic_words)
36
+
37
+    def sort(self) -> None:
38
+        self.messages.sort(key = lambda m: m.timestamp_ms)
39
+
40
+    # NOTE Toughen equality conditions ?
41
+    def __eq__(self, other: FacebookExport) -> bool:
42
+        return self.title == other.title \
43
+            and self.image == other.image

+ 1
- 16
million/model/message.py View File

@@ -1,6 +1,6 @@
1 1
 from datetime import datetime
2 2
 from math import floor
3
-from typing import Any, List, Optional
3
+from typing import Any, List
4 4
 from pydantic import BaseModel
5 5
 
6 6
 class Reaction(BaseModel):
@@ -49,18 +49,3 @@ class Message(BaseModel):
49 49
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50 50
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51 51
         return f"{self.sender_name}({dt_str}) : {self.content}"
52
-
53
-    def get_counted_value(self):
54
-        """
55
-        The content of the message should be (or contain) a number
56
-        """
57
-        value = None
58
-        # Remove any number that is not a digit
59
-        # TODO parse potential math expressions in content
60
-        cleaned_content = ''.join([c for c in self.content if c.isdigit()])
61
-        try:
62
-            value = floor(float(cleaned_content))
63
-        except Exception as e:
64
-            raise ValueError(
65
-                f"Message {cleaned_content} does not contain a number ({e})")
66
-        return value

+ 12
- 11
million/parse/fb_exports.py View File

@@ -1,11 +1,13 @@
1 1
 
2
-import json, os, re
2
+import os, re
3 3
 from typing import List
4 4
 
5 5
 from million.model.fb_export import FacebookExport
6 6
 
7 7
 
8 8
 def is_file_valid(file_name: str) -> bool:
9
+    # NOTE is there a way to peek inside a json file to
10
+    # check its internal structure ?
9 11
     return os.path.splitext(file_name)[-1].lower() == '.json'
10 12
 
11 13
 def valid_dirfiles(file_dir: str) -> List[str]:
@@ -17,23 +19,22 @@ def parse_file(file_name: str) -> FacebookExport:
17 19
     if not is_file_valid(file_name): return None
18 20
 
19 21
     with open(file_name, 'rb') as f:
20
-        fixed_json = __read_broken_fb_json(f.read())
21
-        json_data = json.loads(fixed_json)
22
-        return (FacebookExport(**json_data))
22
+        json_data = __read_broken_fb_json(f.read())
23
+        return FacebookExport.model_validate_json(json_data)
23 24
 
24 25
 def parse_dirfiles(file_dir: str) -> FacebookExport:
25 26
     exports = [parse_file(f) for f in valid_dirfiles(file_dir)]
26
-    if len(exports) == 0: return
27
+    
28
+    result = exports[0]
27 29
 
28
-    for other in exports[1:]:
29
-        exports[0].messages.extend(other.messages)
30
-        exports[0].participants.extend(other.participants)
30
+    for ex in exports[1:]: 
31
+        result.merge(ex)
31 32
 
32
-    exports[0].messages.sort(key = lambda m: m.timestamp_ms)
33
-    exports[0].participants = set(exports[0].participants)
34
-    return exports[0]
33
+    result.sort()
34
+    return result
35 35
 
36 36
 def __read_broken_fb_json(binary_data):
37
+    # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
37 38
     repaired = re.sub(
38 39
         rb'\\u00([\da-f]{2})',
39 40
         lambda m: bytes.fromhex(m.group(1).decode()),

+ 3
- 5
scripts/find_holes.py View File

@@ -3,14 +3,12 @@ from million.analyze.find_holes import compute_sequences, find_holes
3 3
 from million.view.bar_chart import plot as bar_chart
4 4
 from million.analyze.count_participations import count_participations
5 5
 from million.analyze.retain_counts import retain_counts
6
-from million.parse.fb_exports import FacebookExportParser
6
+import million.parse.fb_exports as fb
7 7
 
8 8
 
9 9
 DATA_PATH = './data/'
10 10
 
11
-parser = FacebookExportParser()
12
-
13
-export = parser.parse(DATA_PATH)
11
+export = fb.parse_dirfiles(DATA_PATH)
14 12
 
15 13
 filtered = retain_counts(export.messages)
16 14
 
@@ -29,7 +27,7 @@ for hole in holes:
29 27
 
30 28
 
31 29
 # lets export a csv file of the holes and the people responsible for them
32
-with open('holes.csv', 'w') as f:
30
+with open('output/holes.csv', 'w') as f:
33 31
     f.write('début,fin,taille,responsable1,responsable2,date1,date2\n')
34 32
     for hole in holes:
35 33
         date_start = datetime.utcfromtimestamp(

+ 2
- 4
scripts/read_top.py View File

@@ -1,14 +1,12 @@
1 1
 from million.view.bar_chart import plot as bar_chart
2 2
 from million.analyze.count_participations import count_participations
3 3
 from million.analyze.retain_counts import retain_counts
4
-from million.parse.fb_exports import FacebookExportParser
4
+import million.parse.fb_exports as fb
5 5
 
6 6
 
7 7
 DATA_PATH = './data/'
8 8
 
9
-parser = FacebookExportParser()
10
-
11
-export = parser.parse(DATA_PATH)
9
+export = fb.parse_dirfiles(DATA_PATH)
12 10
 
13 11
 filtered = retain_counts(export.messages)
14 12
 

Loading…
Cancel
Save