petites modifs de syntaxe ailleurs

1年前 · 1c6f0a2c9d
--- a/million/analyze/count_participations.py
+++ b/million/analyze/count_participations.py
@@ -1,23 +1,30 @@
 
				
				-
			
 
				
				-from typing import List
			
 
				
				+from collections import Counter
			
 
				
				+from typing import Dict, List
			
 
				
				 from million.model.message import Message
			
 
				
				 from million.model.participant import Participant
			
 
				
				 
			
 
				
				 
			
 
				
				-def count_participations(messages: List[Message], participants: List[Participant]):
			
 
				
				+def count_participations(
			
 
				
				+        messages: List[Message],
			
 
				
				+        participants: List[Participant] | None = [],
			
 
				
				+        threshold: int | None = 0
			
 
				
				+        ) -> Dict[str, int]:
			
 
				
				     """
			
 
				
				-    Count the number of messages sent by each participant
			
 
				
				+    Count the number of messages sent by each participant,\n
			
 
				
				+    you can specify a threshold to return only people having reached that many counts
			
 
				
				     """
			
 
				
				-    participations = {}
			
 
				
				-    for participant in participants:
			
 
				
				-        participations[participant.name] = 0
			
 
				
				-
			
 
				
				-    for message in messages:
			
 
				
				-        if message.sender_name not in participations:
			
 
				
				-            participations[message.sender_name] = 1
			
 
				
				-        else:
			
 
				
				-            participations[message.sender_name] += 1
			
 
				
				+    participations = dict.fromkeys([p.name for p in participants], 0)
			
 
				
				+    participations.update(Counter([m.sender_name for m in messages]))
			
 
				
				+    
			
 
				
				+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
			
 
				
				 
			
 
				
				-    ordered_participations = sorted(
			
 
				
				-        participations.items(), key=lambda x: x[1], reverse=True)
			
 
				
				-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
			
 
				
				+def podium(
			
 
				
				+        messages: List[Message],
			
 
				
				+        top: int,
			
 
				
				+        participants: List[Participant] | None = [],
			
 
				
				+        ) -> Dict[str, int]:
			
 
				
				+    """
			
 
				
				+    Returns the N biggest counters
			
 
				
				+    """
			
 
				
				+    cp = count_participations(messages, participants)
			
 
				
				+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}
			
--- a/million/analyze/find_holes.py
+++ b/million/analyze/find_holes.py
@@ -1,5 +1,3 @@
 
				
				-
			
 
				
				-
			
 
				
				 from typing import List
			
 
				
				 from million.model.message import Message
			
 
				
				 from million.model.sequence import Sequence
			
@@ -23,7 +21,6 @@ def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) ->
 
				
				 
			
 
				
				     return sequences            
			
 
				
				 
			
 
				
				-
			
 
				
				 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
			
 
				
				     """ 
			
 
				
				     Take sequences as an input and returns a list with every
			
@@ -43,8 +40,6 @@ def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
 
				
				 
			
 
				
				     return result
			
 
				
				 
			
 
				
				-
			
 
				
				-
			
 
				
				 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
			
 
				
				     """ 
			
 
				
				     Returns the sequences representing the spaces between
			
--- a/million/analyze/message_evaluation.py
+++ b/million/analyze/message_evaluation.py
@@ -1,38 +1,46 @@
 
				
				-from math import floor
			
 
				
				 import re
			
 
				
				 from typing import Dict
			
 
				
				 from million.model.message import Message
			
 
				
				 
			
 
				
				-memoization: Dict[Message, int] = {}
			
 
				
				 
			
 
				
				-# TODO WIP
			
 
				
				-# - DNS to resolve audio, gif, pictures with counts
			
 
				
				-def __compute__(msg: Message) -> int:
			
 
				
				-    value = __computeContent(msg)
			
 
				
				+_memoization: Dict[Message, int] = {}
			
 
				
				 
			
 
				
				-    memoization[msg] = value
			
 
				
				+
			
 
				
				+def get(msg: Message) -> int:
			
 
				
				+    """
			
 
				
				+    Returns the estimated value counted in this message
			
 
				
				+    """
			
 
				
				+    return _memoization.get(msg, _compute(msg))
			
 
				
				+
			
 
				
				+def reset(msg: Message) -> None:
			
 
				
				+    """
			
 
				
				+    Drop memorized value of this Message
			
 
				
				+    """
			
 
				
				+    if msg in _memoization:
			
 
				
				+        _memoization.pop(msg)
			
 
				
				+
			
 
				
				+def reset() -> None:
			
 
				
				+    """
			
 
				
				+    Drop every memorized message value
			
 
				
				+    """
			
 
				
				+    _memoization.clear()
			
 
				
				+
			
 
				
				+
			
 
				
				+def _compute(msg: Message) -> int:
			
 
				
				+    # TODO WIP - DNS to resolve audio, gif, pictures with counts
			
 
				
				+    value = _computeContent(msg) or \
			
 
				
				+        None
			
 
				
				+
			
 
				
				+    _memoization[msg] = value
			
 
				
				     return value
			
 
				
				 
			
 
				
				-def __computeContent(msg: Message) -> int:
			
 
				
				+def _computeContent(msg: Message) -> int:
			
 
				
				     # TODO parse potential math expressions in content
			
 
				
				-    match = re.search(r"\d+", msg.content)
			
 
				
				+    match = msg.content and re.search(r"\d+", msg.content)
			
 
				
				     
			
 
				
				     if match:
			
 
				
				-        value = int(match[0])
			
 
				
				+        value = int(match.group())
			
 
				
				     else:
			
 
				
				         value = None
			
 
				
				     
			
 
				
				-    return value
			
 
				
				-
			
 
				
				-def reset(msg: Message) -> None:
			
 
				
				-    if msg in memoization:
			
 
				
				-        memoization.pop(msg)
			
 
				
				-
			
 
				
				-def reset() -> None:
			
 
				
				-    memoization.clear()
			
 
				
				-
			
 
				
				-def get(msg: Message) -> int:
			
 
				
				-    """
			
 
				
				-    Returns the estimated value counted in this message
			
 
				
				-    """
			
 
				
				-    return memoization.get(msg, __compute__(msg))
			
 
				
				+    return value
			
--- a/million/analyze/retain_counts.py
+++ b/million/analyze/retain_counts.py
@@ -1,15 +1,10 @@
 
				
				-
			
 
				
				-import re
			
 
				
				 from typing import List
			
 
				
				 from million.model.message import Message
			
 
				
				+import million.analyze.message_evaluation as msg_val
			
 
				
				 
			
 
				
				 
			
 
				
				 def retain_counts(messages : List[Message])-> List[Message]:
			
 
				
				     """
			
 
				
				-    Retain only the messages that have a content
			
 
				
				+    Retain only the messages that have a counted value
			
 
				
				     """
			
 
				
				-    return [
			
 
				
				-        m for m in messages 
			
 
				
				-        if m.content and
			
 
				
				-        re.search('(\d{2,}|^\d$)', m.content)
			
 
				
				-        ]
			
 
				
				+    return [msg for msg in messages if msg_val.get(msg)]
			
--- a/million/analyze/word_finder.py
+++ b/million/analyze/word_finder.py
@@ -4,7 +4,7 @@ from million.model.message import Message
 
				
				 
			
 
				
				 
			
 
				
				 def _wordFilter(msg: Message, words: List[str]) -> bool:
			
 
				
				-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
			
 
				
				+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
			
 
				
				     return msg.content and re.search(rgx, msg.content, re.I)
			
 
				
				 
			
 
				
				 def findWords(messages: List[Message], words: List[str]) -> List[Message]:
			
--- a/million/model/fb_export.py
+++ b/million/model/fb_export.py
@@ -1,11 +1,10 @@
 
				
				 from __future__ import annotations
			
 
				
				-
			
 
				
				 from typing import Any, List, Set
			
 
				
				 from pydantic import BaseModel
			
 
				
				-
			
 
				
				 from million.model.message import Message
			
 
				
				 from million.model.participant import Participant
			
 
				
				 
			
 
				
				+
			
 
				
				 class Image(BaseModel):
			
 
				
				     creation_timestamp: int
			
 
				
				     uri: str
			
@@ -28,6 +27,7 @@ class FacebookExport(BaseModel):
 
				
				     image: Image
			
 
				
				     joinable_mode: JoinableMode
			
 
				
				 
			
 
				
				+
			
 
				
				     def merge(self, other: FacebookExport) -> None:
			
 
				
				         if self == other:
			
 
				
				             self.messages.extend(other.messages)
			
@@ -37,7 +37,8 @@ class FacebookExport(BaseModel):
 
				
				     def sort(self) -> None:
			
 
				
				         self.messages.sort(key = lambda m: m.timestamp_ms)
			
 
				
				 
			
 
				
				-    # NOTE Toughen equality conditions ?
			
 
				
				+
			
 
				
				     def __eq__(self, other: FacebookExport) -> bool:
			
 
				
				+        # NOTE Toughen equality conditions ?
			
 
				
				         return self.title == other.title \
			
 
				
				             and self.image == other.image
			
--- a/million/model/message.py
+++ b/million/model/message.py
@@ -47,6 +47,7 @@ class Message(BaseModel):
 
				
				 
			
 
				
				     _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
			
 
				
				 
			
 
				
				+
			
 
				
				     def __str__(self) -> str:
			
 
				
				         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
			
 
				
				         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
			
--- a/million/model/sequence.py
+++ b/million/model/sequence.py
@@ -1,8 +1,5 @@
 
				
				 from __future__ import annotations
			
 
				
				-
			
 
				
				-from pydantic import BaseModel
			
 
				
				-import pydantic
			
 
				
				-
			
 
				
				+from pydantic import validator, BaseModel
			
 
				
				 from million.model.message import Message
			
 
				
				 import million.analyze.message_evaluation as msg_val
			
 
				
				 
			
@@ -11,7 +8,7 @@ class Sequence(BaseModel):
 
				
				     start_message: Message
			
 
				
				     end_message: Message | None = None
			
 
				
				 
			
 
				
				-    @pydantic.validator('end_message', pre=True, always=True)
			
 
				
				+    @validator('end_message', pre=True, always=True)
			
 
				
				     def default_end_message(cls, v, *, values):
			
 
				
				         return v or values['start_message'] 
			
 
				
				 
			
--- a/million/parse/fb_exports.py
+++ b/million/parse/fb_exports.py
@@ -48,6 +48,7 @@ def parse_dirfiles(file_dir: str) -> FacebookExport:
 
				
				     result.sort()
			
 
				
				     return result
			
 
				
				 
			
 
				
				+
			
 
				
				 def __read_broken_fb_json(binary_data):
			
 
				
				     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
			
 
				
				     repaired = re.sub(
			
--- a/scripts/find_gromots.py
+++ b/scripts/find_gromots.py
@@ -1,14 +1,9 @@
 
				
				 from datetime import datetime
			
 
				
				 from million.analyze.word_finder import findWords
			
 
				
				-from million.parse.fb_exports import FacebookExportParser
			
 
				
				+import million.parse.fb_exports as fb
			
 
				
				 
			
 
				
				 
			
 
				
				 DATA_PATH = './data/'
			
 
				
				-
			
 
				
				-parser = FacebookExportParser()
			
 
				
				-
			
 
				
				-export = parser.parse(DATA_PATH)
			
 
				
				-
			
 
				
				 gros_mots = [
			
 
				
				     '.*merde.*',
			
 
				
				     'sexe',
			
@@ -27,6 +22,7 @@ gros_mots = [
 
				
				     'bais.*'
			
 
				
				     ]
			
 
				
				 
			
 
				
				+export = fb.parse_dirfiles(DATA_PATH)
			
 
				
				 msg_gros_mots = findWords(export.messages, gros_mots)
			
 
				
				 
			
 
				
				 msg_gros_mots_grp = {}
			
--- a/scripts/find_holes.py
+++ b/scripts/find_holes.py
@@ -1,5 +1,5 @@
 
				
				 from datetime import datetime
			
 
				
				-from million.analyze.find_holes import compute_sequences, find_holes
			
 
				
				+import million.analyze.find_holes as fh
			
 
				
				 from million.analyze.retain_counts import retain_counts
			
 
				
				 import million.parse.fb_exports as fb
			
 
				
				 
			
@@ -10,13 +10,15 @@ export = fb.parse_dirfiles(DATA_PATH)
 
				
				 
			
 
				
				 filtered = retain_counts(export.messages)
			
 
				
				 
			
 
				
				-sequences = compute_sequences(filtered)
			
 
				
				+sequences = fh.compute_sequences(filtered)
			
 
				
				 
			
 
				
				 actual_counted = sum([s.length() for s in sequences])
			
 
				
				 
			
 
				
				 print(f"Actual counted: {actual_counted}")
			
 
				
				 
			
 
				
				-holes = find_holes(filtered)
			
 
				
				+merged = fh.merge_duplicates(sequences)
			
 
				
				+merged = [s for s in merged if s.length() > 1]
			
 
				
				+holes = fh.find_holes(filtered)
			
 
				
				 
			
 
				
				 print(len(holes))
			
 
				
				 
			
--- a/scripts/read_top.py
+++ b/scripts/read_top.py
@@ -1,4 +1,3 @@
 
				
				-from million.view.bar_chart import plot as bar_chart
			
 
				
				 from million.analyze.count_participations import count_participations
			
 
				
				 from million.analyze.retain_counts import retain_counts
			
 
				
				 import million.parse.fb_exports as fb
			
@@ -7,15 +6,11 @@ import million.parse.fb_exports as fb
 
				
				 DATA_PATH = './data/'
			
 
				
				 
			
 
				
				 export = fb.parse_dirfiles(DATA_PATH)
			
 
				
				-
			
 
				
				 filtered = retain_counts(export.messages)
			
 
				
				 
			
 
				
				 print(len(filtered))
			
 
				
				 
			
 
				
				-counted_participations = count_participations(filtered, export.participants)
			
 
				
				-
			
 
				
				-kept_participations = [
			
 
				
				-    p for p in counted_participations if p['participations'] > 100]
			
 
				
				+participations = count_participations(filtered, export.participants, 100)
			
 
				
				 
			
 
				
				-print("\n".join(
			
 
				
				-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
			
 
				
				+for name, count in participations.items():
			
 
				
				+    print(f"{name}: {count}")