petites modifs de syntaxe ailleurs

1年前 · 1c6f0a2c9d
--- a/million/analyze/count_participations.py
+++ b/million/analyze/count_participations.py
 
															
															-
														
 
															
															-from typing import List
														
 
															
															+from collections import Counter
														
 
															
															+from typing import Dict, List
														
 
															
															 from million.model.message import Message
														
 
															
															 from million.model.participant import Participant
														
 
															
															-def count_participations(messages: List[Message], participants: List[Participant]):
														
 
															
															+def count_participations(
														
 
															
															+        messages: List[Message],
														
 
															
															+        participants: List[Participant] | None = [],
														
 
															
															+        threshold: int | None = 0
														
 
															
															+        ) -> Dict[str, int]:
														
 
															
															     """
														
 
															
															-    Count the number of messages sent by each participant
														
 
															
															+    Count the number of messages sent by each participant,\n
														
 
															
															+    you can specify a threshold to return only people having reached that many counts
														
 
															
															     """
														
 
															
															-    participations = {}
														
 
															
															-    for participant in participants:
														
 
															
															-        participations[participant.name] = 0
														
 
															
															-
														
 
															
															-    for message in messages:
														
 
															
															-        if message.sender_name not in participations:
														
 
															
															-            participations[message.sender_name] = 1
														
 
															
															-        else:
														
 
															
															-            participations[message.sender_name] += 1
														
 
															
															+    participations = dict.fromkeys([p.name for p in participants], 0)
														
 
															
															+    participations.update(Counter([m.sender_name for m in messages]))
														
 
															
															+    
														
 
															
															+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
														
 
															
															-    ordered_participations = sorted(
														
 
															
															-        participations.items(), key=lambda x: x[1], reverse=True)
														
 
															
															-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
														
 
															
															+def podium(
														
 
															
															+        messages: List[Message],
														
 
															
															+        top: int,
														
 
															
															+        participants: List[Participant] | None = [],
														
 
															
															+        ) -> Dict[str, int]:
														
 
															
															+    """
														
 
															
															+    Returns the N biggest counters
														
 
															
															+    """
														
 
															
															+    cp = count_participations(messages, participants)
														
 
															
															+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}
														
--- a/million/analyze/find_holes.py
+++ b/million/analyze/find_holes.py
 
															
															-
														
 
															
															-
														
 
															
															 from typing import List
														
 
															
															 from million.model.message import Message
														
 
															
															 from million.model.sequence import Sequence
														
 
															
															     return sequences            
														
 
															
															-
														
 
															
															 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
														
 
															
															     """ 
														
 
															
															     Take sequences as an input and returns a list with every
														
 
															
															     return result
														
 
															
															-
														
 
															
															-
														
 
															
															 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
														
 
															
															     """ 
														
 
															
															     Returns the sequences representing the spaces between
														
--- a/million/analyze/message_evaluation.py
+++ b/million/analyze/message_evaluation.py
 
															
															-from math import floor
														
 
															
															 import re
														
 
															
															 from typing import Dict
														
 
															
															 from million.model.message import Message
														
 
															
															-memoization: Dict[Message, int] = {}
														
 
															
															-# TODO WIP
														
 
															
															-# - DNS to resolve audio, gif, pictures with counts
														
 
															
															-def __compute__(msg: Message) -> int:
														
 
															
															-    value = __computeContent(msg)
														
 
															
															+_memoization: Dict[Message, int] = {}
														
 
															
															-    memoization[msg] = value
														
 
															
															+
														
 
															
															+def get(msg: Message) -> int:
														
 
															
															+    """
														
 
															
															+    Returns the estimated value counted in this message
														
 
															
															+    """
														
 
															
															+    return _memoization.get(msg, _compute(msg))
														
 
															
															+
														
 
															
															+def reset(msg: Message) -> None:
														
 
															
															+    """
														
 
															
															+    Drop memorized value of this Message
														
 
															
															+    """
														
 
															
															+    if msg in _memoization:
														
 
															
															+        _memoization.pop(msg)
														
 
															
															+
														
 
															
															+def reset() -> None:
														
 
															
															+    """
														
 
															
															+    Drop every memorized message value
														
 
															
															+    """
														
 
															
															+    _memoization.clear()
														
 
															
															+
														
 
															
															+
														
 
															
															+def _compute(msg: Message) -> int:
														
 
															
															+    # TODO WIP - DNS to resolve audio, gif, pictures with counts
														
 
															
															+    value = _computeContent(msg) or \
														
 
															
															+        None
														
 
															
															+
														
 
															
															+    _memoization[msg] = value
														
 
															
															     return value
														
 
															
															-def __computeContent(msg: Message) -> int:
														
 
															
															+def _computeContent(msg: Message) -> int:
														
 
															
															     # TODO parse potential math expressions in content
														
 
															
															-    match = re.search(r"\d+", msg.content)
														
 
															
															+    match = msg.content and re.search(r"\d+", msg.content)
														
 
															
															     if match:
														
 
															
															-        value = int(match[0])
														
 
															
															+        value = int(match.group())
														
 
															
															     else:
														
 
															
															         value = None
														
 
															
															-    return value
														
 
															
															-
														
 
															
															-def reset(msg: Message) -> None:
														
 
															
															-    if msg in memoization:
														
 
															
															-        memoization.pop(msg)
														
 
															
															-
														
 
															
															-def reset() -> None:
														
 
															
															-    memoization.clear()
														
 
															
															-
														
 
															
															-def get(msg: Message) -> int:
														
 
															
															-    """
														
 
															
															-    Returns the estimated value counted in this message
														
 
															
															-    """
														
 
															
															-    return memoization.get(msg, __compute__(msg))
														
 
															
															+    return value
														
--- a/million/analyze/retain_counts.py
+++ b/million/analyze/retain_counts.py
 
															
															-
														
 
															
															-import re
														
 
															
															 from typing import List
														
 
															
															 from million.model.message import Message
														
 
															
															+import million.analyze.message_evaluation as msg_val
														
 
															
															 def retain_counts(messages : List[Message])-> List[Message]:
														
 
															
															     """
														
 
															
															-    Retain only the messages that have a content
														
 
															
															+    Retain only the messages that have a counted value
														
 
															
															     """
														
 
															
															-    return [
														
 
															
															-        m for m in messages 
														
 
															
															-        if m.content and
														
 
															
															-        re.search('(\d{2,}|^\d$)', m.content)
														
 
															
															-        ]
														
 
															
															+    return [msg for msg in messages if msg_val.get(msg)]
														
--- a/million/analyze/word_finder.py
+++ b/million/analyze/word_finder.py
 
															
															 def _wordFilter(msg: Message, words: List[str]) -> bool:
														
 
															
															-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
														
 
															
															+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
														
 
															
															     return msg.content and re.search(rgx, msg.content, re.I)
														
 
															
															 def findWords(messages: List[Message], words: List[str]) -> List[Message]:
														
--- a/million/model/fb_export.py
+++ b/million/model/fb_export.py
 
															
															 from __future__ import annotations
														
 
															
															-
														
 
															
															 from typing import Any, List, Set
														
 
															
															 from pydantic import BaseModel
														
 
															
															-
														
 
															
															 from million.model.message import Message
														
 
															
															 from million.model.participant import Participant
														
 
															
															+
														
 
															
															 class Image(BaseModel):
														
 
															
															     creation_timestamp: int
														
 
															
															     uri: str
														
 
															
															     image: Image
														
 
															
															     joinable_mode: JoinableMode
														
 
															
															+
														
 
															
															     def merge(self, other: FacebookExport) -> None:
														
 
															
															         if self == other:
														
 
															
															             self.messages.extend(other.messages)
														
 
															
															     def sort(self) -> None:
														
 
															
															         self.messages.sort(key = lambda m: m.timestamp_ms)
														
 
															
															-    # NOTE Toughen equality conditions ?
														
 
															
															+
														
 
															
															     def __eq__(self, other: FacebookExport) -> bool:
														
 
															
															+        # NOTE Toughen equality conditions ?
														
 
															
															         return self.title == other.title \
														
 
															
															             and self.image == other.image
														
--- a/million/model/message.py
+++ b/million/model/message.py
 
															
															     _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
														
 
															
															+
														
 
															
															     def __str__(self) -> str:
														
 
															
															         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
														
 
															
															         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
														
--- a/million/model/sequence.py
+++ b/million/model/sequence.py
 
															
															 from __future__ import annotations
														
 
															
															-
														
 
															
															-from pydantic import BaseModel
														
 
															
															-import pydantic
														
 
															
															-
														
 
															
															+from pydantic import validator, BaseModel
														
 
															
															 from million.model.message import Message
														
 
															
															 import million.analyze.message_evaluation as msg_val
														
 
															
															     start_message: Message
														
 
															
															     end_message: Message | None = None
														
 
															
															-    @pydantic.validator('end_message', pre=True, always=True)
														
 
															
															+    @validator('end_message', pre=True, always=True)
														
 
															
															     def default_end_message(cls, v, *, values):
														
 
															
															         return v or values['start_message'] 
														
--- a/million/parse/fb_exports.py
+++ b/million/parse/fb_exports.py
 
															
															     result.sort()
														
 
															
															     return result
														
 
															
															+
														
 
															
															 def __read_broken_fb_json(binary_data):
														
 
															
															     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
														
 
															
															     repaired = re.sub(
														
--- a/scripts/find_gromots.py
+++ b/scripts/find_gromots.py
 
															
															 from datetime import datetime
														
 
															
															 from million.analyze.word_finder import findWords
														
 
															
															-from million.parse.fb_exports import FacebookExportParser
														
 
															
															+import million.parse.fb_exports as fb
														
 
															
															 DATA_PATH = './data/'
														
 
															
															-
														
 
															
															-parser = FacebookExportParser()
														
 
															
															-
														
 
															
															-export = parser.parse(DATA_PATH)
														
 
															
															-
														
 
															
															 gros_mots = [
														
 
															
															     '.*merde.*',
														
 
															
															     'sexe',
														
 
															
															     'bais.*'
														
 
															
															     ]
														
 
															
															+export = fb.parse_dirfiles(DATA_PATH)
														
 
															
															 msg_gros_mots = findWords(export.messages, gros_mots)
														
 
															
															 msg_gros_mots_grp = {}
														
--- a/scripts/find_holes.py
+++ b/scripts/find_holes.py
 
															
															 from datetime import datetime
														
 
															
															-from million.analyze.find_holes import compute_sequences, find_holes
														
 
															
															+import million.analyze.find_holes as fh
														
 
															
															 from million.analyze.retain_counts import retain_counts
														
 
															
															 import million.parse.fb_exports as fb
														
 
															
															 filtered = retain_counts(export.messages)
														
 
															
															-sequences = compute_sequences(filtered)
														
 
															
															+sequences = fh.compute_sequences(filtered)
														
 
															
															 actual_counted = sum([s.length() for s in sequences])
														
 
															
															 print(f"Actual counted: {actual_counted}")
														
 
															
															-holes = find_holes(filtered)
														
 
															
															+merged = fh.merge_duplicates(sequences)
														
 
															
															+merged = [s for s in merged if s.length() > 1]
														
 
															
															+holes = fh.find_holes(filtered)
														
 
															
															 print(len(holes))
														
--- a/scripts/read_top.py
+++ b/scripts/read_top.py
 
															
															-from million.view.bar_chart import plot as bar_chart
														
 
															
															 from million.analyze.count_participations import count_participations
														
 
															
															 from million.analyze.retain_counts import retain_counts
														
 
															
															 import million.parse.fb_exports as fb
														
 
															
															 DATA_PATH = './data/'
														
 
															
															 export = fb.parse_dirfiles(DATA_PATH)
														
 
															
															-
														
 
															
															 filtered = retain_counts(export.messages)
														
 
															
															 print(len(filtered))
														
 
															
															-counted_participations = count_participations(filtered, export.participants)
														
 
															
															-
														
 
															
															-kept_participations = [
														
 
															
															-    p for p in counted_participations if p['participations'] > 100]
														
 
															
															+participations = count_participations(filtered, export.participants, 100)
														
 
															
															-print("\n".join(
														
 
															
															-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
														
 
															
															+for name, count in participations.items():
														
 
															
															+    print(f"{name}: {count}")