Parcourir la source

Refacto dans la partie analyze

petites modifs de syntaxe ailleurs
feature/message_filters
Figg il y a 8 mois
Parent
révision
1c6f0a2c9d

+ 23
- 16
million/analyze/count_participations.py Voir le fichier

@@ -1,23 +1,30 @@
1
-
2
-from typing import List
1
+from collections import Counter
2
+from typing import Dict, List
3 3
 from million.model.message import Message
4 4
 from million.model.participant import Participant
5 5
 
6 6
 
7
-def count_participations(messages: List[Message], participants: List[Participant]):
7
+def count_participations(
8
+        messages: List[Message],
9
+        participants: List[Participant] | None = [],
10
+        threshold: int | None = 0
11
+        ) -> Dict[str, int]:
8 12
     """
9
-    Count the number of messages sent by each participant
13
+    Count the number of messages sent by each participant,\n
14
+    you can specify a threshold to return only people having reached that many counts
10 15
     """
11
-    participations = {}
12
-    for participant in participants:
13
-        participations[participant.name] = 0
14
-
15
-    for message in messages:
16
-        if message.sender_name not in participations:
17
-            participations[message.sender_name] = 1
18
-        else:
19
-            participations[message.sender_name] += 1
16
+    participations = dict.fromkeys([p.name for p in participants], 0)
17
+    participations.update(Counter([m.sender_name for m in messages]))
18
+    
19
+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
20 20
 
21
-    ordered_participations = sorted(
22
-        participations.items(), key=lambda x: x[1], reverse=True)
23
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
21
+def podium(
22
+        messages: List[Message],
23
+        top: int,
24
+        participants: List[Participant] | None = [],
25
+        ) -> Dict[str, int]:
26
+    """
27
+    Returns the N biggest counters
28
+    """
29
+    cp = count_participations(messages, participants)
30
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}

+ 0
- 5
million/analyze/find_holes.py Voir le fichier

@@ -1,5 +1,3 @@
1
-
2
-
3 1
 from typing import List
4 2
 from million.model.message import Message
5 3
 from million.model.sequence import Sequence
@@ -23,7 +21,6 @@ def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) ->
23 21
 
24 22
     return sequences            
25 23
 
26
-
27 24
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28 25
     """ 
29 26
     Take sequences as an input and returns a list with every
@@ -43,8 +40,6 @@ def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
43 40
 
44 41
     return result
45 42
 
46
-
47
-
48 43
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49 44
     """ 
50 45
     Returns the sequences representing the spaces between

+ 32
- 24
million/analyze/message_evaluation.py Voir le fichier

@@ -1,38 +1,46 @@
1
-from math import floor
2 1
 import re
3 2
 from typing import Dict
4 3
 from million.model.message import Message
5 4
 
6
-memoization: Dict[Message, int] = {}
7 5
 
8
-# TODO WIP
9
-# - DNS to resolve audio, gif, pictures with counts
10
-def __compute__(msg: Message) -> int:
11
-    value = __computeContent(msg)
6
+_memoization: Dict[Message, int] = {}
12 7
 
13
-    memoization[msg] = value
8
+
9
+def get(msg: Message) -> int:
10
+    """
11
+    Returns the estimated value counted in this message
12
+    """
13
+    return _memoization.get(msg, _compute(msg))
14
+
15
+def reset(msg: Message) -> None:
16
+    """
17
+    Drop memorized value of this Message
18
+    """
19
+    if msg in _memoization:
20
+        _memoization.pop(msg)
21
+
22
+def reset() -> None:
23
+    """
24
+    Drop every memorized message value
25
+    """
26
+    _memoization.clear()
27
+
28
+
29
+def _compute(msg: Message) -> int:
30
+    # TODO WIP - DNS to resolve audio, gif, pictures with counts
31
+    value = _computeContent(msg) or \
32
+        None
33
+
34
+    _memoization[msg] = value
14 35
     return value
15 36
 
16
-def __computeContent(msg: Message) -> int:
37
+def _computeContent(msg: Message) -> int:
17 38
     # TODO parse potential math expressions in content
18
-    match = re.search(r"\d+", msg.content)
39
+    match = msg.content and re.search(r"\d+", msg.content)
19 40
     
20 41
     if match:
21
-        value = int(match[0])
42
+        value = int(match.group())
22 43
     else:
23 44
         value = None
24 45
     
25
-    return value
26
-
27
-def reset(msg: Message) -> None:
28
-    if msg in memoization:
29
-        memoization.pop(msg)
30
-
31
-def reset() -> None:
32
-    memoization.clear()
33
-
34
-def get(msg: Message) -> int:
35
-    """
36
-    Returns the estimated value counted in this message
37
-    """
38
-    return memoization.get(msg, __compute__(msg))
46
+    return value

+ 3
- 8
million/analyze/retain_counts.py Voir le fichier

@@ -1,15 +1,10 @@
1
-
2
-import re
3 1
 from typing import List
4 2
 from million.model.message import Message
3
+import million.analyze.message_evaluation as msg_val
5 4
 
6 5
 
7 6
 def retain_counts(messages : List[Message])-> List[Message]:
8 7
     """
9
-    Retain only the messages that have a content
8
+    Retain only the messages that have a counted value
10 9
     """
11
-    return [
12
-        m for m in messages 
13
-        if m.content and
14
-        re.search('(\d{2,}|^\d$)', m.content)
15
-        ]
10
+    return [msg for msg in messages if msg_val.get(msg)]

+ 1
- 1
million/analyze/word_finder.py Voir le fichier

@@ -4,7 +4,7 @@ from million.model.message import Message
4 4
 
5 5
 
6 6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
7
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
7
+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
8 8
     return msg.content and re.search(rgx, msg.content, re.I)
9 9
 
10 10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:

+ 4
- 3
million/model/fb_export.py Voir le fichier

@@ -1,11 +1,10 @@
1 1
 from __future__ import annotations
2
-
3 2
 from typing import Any, List, Set
4 3
 from pydantic import BaseModel
5
-
6 4
 from million.model.message import Message
7 5
 from million.model.participant import Participant
8 6
 
7
+
9 8
 class Image(BaseModel):
10 9
     creation_timestamp: int
11 10
     uri: str
@@ -28,6 +27,7 @@ class FacebookExport(BaseModel):
28 27
     image: Image
29 28
     joinable_mode: JoinableMode
30 29
 
30
+
31 31
     def merge(self, other: FacebookExport) -> None:
32 32
         if self == other:
33 33
             self.messages.extend(other.messages)
@@ -37,7 +37,8 @@ class FacebookExport(BaseModel):
37 37
     def sort(self) -> None:
38 38
         self.messages.sort(key = lambda m: m.timestamp_ms)
39 39
 
40
-    # NOTE Toughen equality conditions ?
40
+
41 41
     def __eq__(self, other: FacebookExport) -> bool:
42
+        # NOTE Toughen equality conditions ?
42 43
         return self.title == other.title \
43 44
             and self.image == other.image

+ 1
- 0
million/model/message.py Voir le fichier

@@ -47,6 +47,7 @@ class Message(BaseModel):
47 47
 
48 48
     _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
49 49
 
50
+
50 51
     def __str__(self) -> str:
51 52
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
52 53
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")

+ 2
- 5
million/model/sequence.py Voir le fichier

@@ -1,8 +1,5 @@
1 1
 from __future__ import annotations
2
-
3
-from pydantic import BaseModel
4
-import pydantic
5
-
2
+from pydantic import validator, BaseModel
6 3
 from million.model.message import Message
7 4
 import million.analyze.message_evaluation as msg_val
8 5
 
@@ -11,7 +8,7 @@ class Sequence(BaseModel):
11 8
     start_message: Message
12 9
     end_message: Message | None = None
13 10
 
14
-    @pydantic.validator('end_message', pre=True, always=True)
11
+    @validator('end_message', pre=True, always=True)
15 12
     def default_end_message(cls, v, *, values):
16 13
         return v or values['start_message'] 
17 14
 

+ 1
- 0
million/parse/fb_exports.py Voir le fichier

@@ -48,6 +48,7 @@ def parse_dirfiles(file_dir: str) -> FacebookExport:
48 48
     result.sort()
49 49
     return result
50 50
 
51
+
51 52
 def __read_broken_fb_json(binary_data):
52 53
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53 54
     repaired = re.sub(

+ 2
- 6
scripts/find_gromots.py Voir le fichier

@@ -1,14 +1,9 @@
1 1
 from datetime import datetime
2 2
 from million.analyze.word_finder import findWords
3
-from million.parse.fb_exports import FacebookExportParser
3
+import million.parse.fb_exports as fb
4 4
 
5 5
 
6 6
 DATA_PATH = './data/'
7
-
8
-parser = FacebookExportParser()
9
-
10
-export = parser.parse(DATA_PATH)
11
-
12 7
 gros_mots = [
13 8
     '.*merde.*',
14 9
     'sexe',
@@ -27,6 +22,7 @@ gros_mots = [
27 22
     'bais.*'
28 23
     ]
29 24
 
25
+export = fb.parse_dirfiles(DATA_PATH)
30 26
 msg_gros_mots = findWords(export.messages, gros_mots)
31 27
 
32 28
 msg_gros_mots_grp = {}

+ 5
- 3
scripts/find_holes.py Voir le fichier

@@ -1,5 +1,5 @@
1 1
 from datetime import datetime
2
-from million.analyze.find_holes import compute_sequences, find_holes
2
+import million.analyze.find_holes as fh
3 3
 from million.analyze.retain_counts import retain_counts
4 4
 import million.parse.fb_exports as fb
5 5
 
@@ -10,13 +10,15 @@ export = fb.parse_dirfiles(DATA_PATH)
10 10
 
11 11
 filtered = retain_counts(export.messages)
12 12
 
13
-sequences = compute_sequences(filtered)
13
+sequences = fh.compute_sequences(filtered)
14 14
 
15 15
 actual_counted = sum([s.length() for s in sequences])
16 16
 
17 17
 print(f"Actual counted: {actual_counted}")
18 18
 
19
-holes = find_holes(filtered)
19
+merged = fh.merge_duplicates(sequences)
20
+merged = [s for s in merged if s.length() > 1]
21
+holes = fh.find_holes(filtered)
20 22
 
21 23
 print(len(holes))
22 24
 

+ 3
- 8
scripts/read_top.py Voir le fichier

@@ -1,4 +1,3 @@
1
-from million.view.bar_chart import plot as bar_chart
2 1
 from million.analyze.count_participations import count_participations
3 2
 from million.analyze.retain_counts import retain_counts
4 3
 import million.parse.fb_exports as fb
@@ -7,15 +6,11 @@ import million.parse.fb_exports as fb
7 6
 DATA_PATH = './data/'
8 7
 
9 8
 export = fb.parse_dirfiles(DATA_PATH)
10
-
11 9
 filtered = retain_counts(export.messages)
12 10
 
13 11
 print(len(filtered))
14 12
 
15
-counted_participations = count_participations(filtered, export.participants)
16
-
17
-kept_participations = [
18
-    p for p in counted_participations if p['participations'] > 100]
13
+participations = count_participations(filtered, export.participants, 100)
19 14
 
20
-print("\n".join(
21
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
15
+for name, count in participations.items():
16
+    print(f"{name}: {count}")

Chargement…
Annuler
Enregistrer