Browse Source

Refactorisation de la partie analyze

Petites modifications de syntaxe ailleurs
feature/message_filters
Figg 9 months ago
parent
commit
1c6f0a2c9d

+ 23
- 16
million/analyze/count_participations.py View File

1
-
2
-from typing import List
1
+from collections import Counter
2
+from typing import Dict, List
3
 from million.model.message import Message
3
 from million.model.message import Message
4
 from million.model.participant import Participant
4
 from million.model.participant import Participant
5
 
5
 
6
 
6
 
7
-def count_participations(messages: List[Message], participants: List[Participant]):
7
+def count_participations(
8
+        messages: List[Message],
9
+        participants: List[Participant] | None = [],
10
+        threshold: int | None = 0
11
+        ) -> Dict[str, int]:
8
     """
12
     """
9
-    Count the number of messages sent by each participant
13
+    Count the number of messages sent by each participant,\n
14
+    you can specify a threshold to return only people having reached that many counts
10
     """
15
     """
11
-    participations = {}
12
-    for participant in participants:
13
-        participations[participant.name] = 0
14
-
15
-    for message in messages:
16
-        if message.sender_name not in participations:
17
-            participations[message.sender_name] = 1
18
-        else:
19
-            participations[message.sender_name] += 1
16
+    participations = dict.fromkeys([p.name for p in participants], 0)
17
+    participations.update(Counter([m.sender_name for m in messages]))
18
+    
19
+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
20
 
20
 
21
-    ordered_participations = sorted(
22
-        participations.items(), key=lambda x: x[1], reverse=True)
23
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
21
+def podium(
22
+        messages: List[Message],
23
+        top: int,
24
+        participants: List[Participant] | None = [],
25
+        ) -> Dict[str, int]:
26
+    """
27
+    Returns the N biggest counters
28
+    """
29
+    cp = count_participations(messages, participants)
30
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}

+ 0
- 5
million/analyze/find_holes.py View File

1
-
2
-
3
 from typing import List
1
 from typing import List
4
 from million.model.message import Message
2
 from million.model.message import Message
5
 from million.model.sequence import Sequence
3
 from million.model.sequence import Sequence
23
 
21
 
24
     return sequences            
22
     return sequences            
25
 
23
 
26
-
27
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
24
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28
     """ 
25
     """ 
29
     Take sequences as an input and returns a list with every
26
     Take sequences as an input and returns a list with every
43
 
40
 
44
     return result
41
     return result
45
 
42
 
46
-
47
-
48
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
43
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49
     """ 
44
     """ 
50
     Returns the sequences representing the spaces between
45
     Returns the sequences representing the spaces between

+ 32
- 24
million/analyze/message_evaluation.py View File

1
-from math import floor
2
 import re
1
 import re
3
 from typing import Dict
2
 from typing import Dict
4
 from million.model.message import Message
3
 from million.model.message import Message
5
 
4
 
6
-memoization: Dict[Message, int] = {}
7
 
5
 
8
-# TODO WIP
9
-# - DNS to resolve audio, gif, pictures with counts
10
-def __compute__(msg: Message) -> int:
11
-    value = __computeContent(msg)
6
+_memoization: Dict[Message, int] = {}
12
 
7
 
13
-    memoization[msg] = value
8
+
9
+def get(msg: Message) -> int:
10
+    """
11
+    Returns the estimated value counted in this message
12
+    """
13
+    return _memoization.get(msg, _compute(msg))
14
+
15
+def reset(msg: Message) -> None:
16
+    """
17
+    Drop memorized value of this Message
18
+    """
19
+    if msg in _memoization:
20
+        _memoization.pop(msg)
21
+
22
+def reset() -> None:
23
+    """
24
+    Drop every memorized message value
25
+    """
26
+    _memoization.clear()
27
+
28
+
29
+def _compute(msg: Message) -> int:
30
+    # TODO WIP - DNS to resolve audio, gif, pictures with counts
31
+    value = _computeContent(msg) or \
32
+        None
33
+
34
+    _memoization[msg] = value
14
     return value
35
     return value
15
 
36
 
16
-def __computeContent(msg: Message) -> int:
37
+def _computeContent(msg: Message) -> int:
17
     # TODO parse potential math expressions in content
38
     # TODO parse potential math expressions in content
18
-    match = re.search(r"\d+", msg.content)
39
+    match = msg.content and re.search(r"\d+", msg.content)
19
     
40
     
20
     if match:
41
     if match:
21
-        value = int(match[0])
42
+        value = int(match.group())
22
     else:
43
     else:
23
         value = None
44
         value = None
24
     
45
     
25
-    return value
26
-
27
-def reset(msg: Message) -> None:
28
-    if msg in memoization:
29
-        memoization.pop(msg)
30
-
31
-def reset() -> None:
32
-    memoization.clear()
33
-
34
-def get(msg: Message) -> int:
35
-    """
36
-    Returns the estimated value counted in this message
37
-    """
38
-    return memoization.get(msg, __compute__(msg))
46
+    return value

+ 3
- 8
million/analyze/retain_counts.py View File

1
-
2
-import re
3
 from typing import List
1
 from typing import List
4
 from million.model.message import Message
2
 from million.model.message import Message
3
+import million.analyze.message_evaluation as msg_val
5
 
4
 
6
 
5
 
7
 def retain_counts(messages : List[Message])-> List[Message]:
6
 def retain_counts(messages : List[Message])-> List[Message]:
8
     """
7
     """
9
-    Retain only the messages that have a content
8
+    Retain only the messages that have a counted value
10
     """
9
     """
11
-    return [
12
-        m for m in messages 
13
-        if m.content and
14
-        re.search('(\d{2,}|^\d$)', m.content)
15
-        ]
10
+    return [msg for msg in messages if msg_val.get(msg)]

+ 1
- 1
million/analyze/word_finder.py View File

4
 
4
 
5
 
5
 
6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
7
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
7
+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
8
     return msg.content and re.search(rgx, msg.content, re.I)
8
     return msg.content and re.search(rgx, msg.content, re.I)
9
 
9
 
10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:
10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:

+ 4
- 3
million/model/fb_export.py View File

1
 from __future__ import annotations
1
 from __future__ import annotations
2
-
3
 from typing import Any, List, Set
2
 from typing import Any, List, Set
4
 from pydantic import BaseModel
3
 from pydantic import BaseModel
5
-
6
 from million.model.message import Message
4
 from million.model.message import Message
7
 from million.model.participant import Participant
5
 from million.model.participant import Participant
8
 
6
 
7
+
9
 class Image(BaseModel):
8
 class Image(BaseModel):
10
     creation_timestamp: int
9
     creation_timestamp: int
11
     uri: str
10
     uri: str
28
     image: Image
27
     image: Image
29
     joinable_mode: JoinableMode
28
     joinable_mode: JoinableMode
30
 
29
 
30
+
31
     def merge(self, other: FacebookExport) -> None:
31
     def merge(self, other: FacebookExport) -> None:
32
         if self == other:
32
         if self == other:
33
             self.messages.extend(other.messages)
33
             self.messages.extend(other.messages)
37
     def sort(self) -> None:
37
     def sort(self) -> None:
38
         self.messages.sort(key = lambda m: m.timestamp_ms)
38
         self.messages.sort(key = lambda m: m.timestamp_ms)
39
 
39
 
40
-    # NOTE Toughen equality conditions ?
40
+
41
     def __eq__(self, other: FacebookExport) -> bool:
41
     def __eq__(self, other: FacebookExport) -> bool:
42
+        # NOTE Toughen equality conditions ?
42
         return self.title == other.title \
43
         return self.title == other.title \
43
             and self.image == other.image
44
             and self.image == other.image

+ 1
- 0
million/model/message.py View File

47
 
47
 
48
     _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
48
     _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
49
 
49
 
50
+
50
     def __str__(self) -> str:
51
     def __str__(self) -> str:
51
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
52
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
52
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
53
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")

+ 2
- 5
million/model/sequence.py View File

1
 from __future__ import annotations
1
 from __future__ import annotations
2
-
3
-from pydantic import BaseModel
4
-import pydantic
5
-
2
+from pydantic import validator, BaseModel
6
 from million.model.message import Message
3
 from million.model.message import Message
7
 import million.analyze.message_evaluation as msg_val
4
 import million.analyze.message_evaluation as msg_val
8
 
5
 
11
     start_message: Message
8
     start_message: Message
12
     end_message: Message | None = None
9
     end_message: Message | None = None
13
 
10
 
14
-    @pydantic.validator('end_message', pre=True, always=True)
11
+    @validator('end_message', pre=True, always=True)
15
     def default_end_message(cls, v, *, values):
12
     def default_end_message(cls, v, *, values):
16
         return v or values['start_message'] 
13
         return v or values['start_message'] 
17
 
14
 

+ 1
- 0
million/parse/fb_exports.py View File

48
     result.sort()
48
     result.sort()
49
     return result
49
     return result
50
 
50
 
51
+
51
 def __read_broken_fb_json(binary_data):
52
 def __read_broken_fb_json(binary_data):
52
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53
     repaired = re.sub(
54
     repaired = re.sub(

+ 2
- 6
scripts/find_gromots.py View File

1
 from datetime import datetime
1
 from datetime import datetime
2
 from million.analyze.word_finder import findWords
2
 from million.analyze.word_finder import findWords
3
-from million.parse.fb_exports import FacebookExportParser
3
+import million.parse.fb_exports as fb
4
 
4
 
5
 
5
 
6
 DATA_PATH = './data/'
6
 DATA_PATH = './data/'
7
-
8
-parser = FacebookExportParser()
9
-
10
-export = parser.parse(DATA_PATH)
11
-
12
 gros_mots = [
7
 gros_mots = [
13
     '.*merde.*',
8
     '.*merde.*',
14
     'sexe',
9
     'sexe',
27
     'bais.*'
22
     'bais.*'
28
     ]
23
     ]
29
 
24
 
25
+export = fb.parse_dirfiles(DATA_PATH)
30
 msg_gros_mots = findWords(export.messages, gros_mots)
26
 msg_gros_mots = findWords(export.messages, gros_mots)
31
 
27
 
32
 msg_gros_mots_grp = {}
28
 msg_gros_mots_grp = {}

+ 5
- 3
scripts/find_holes.py View File

1
 from datetime import datetime
1
 from datetime import datetime
2
-from million.analyze.find_holes import compute_sequences, find_holes
2
+import million.analyze.find_holes as fh
3
 from million.analyze.retain_counts import retain_counts
3
 from million.analyze.retain_counts import retain_counts
4
 import million.parse.fb_exports as fb
4
 import million.parse.fb_exports as fb
5
 
5
 
10
 
10
 
11
 filtered = retain_counts(export.messages)
11
 filtered = retain_counts(export.messages)
12
 
12
 
13
-sequences = compute_sequences(filtered)
13
+sequences = fh.compute_sequences(filtered)
14
 
14
 
15
 actual_counted = sum([s.length() for s in sequences])
15
 actual_counted = sum([s.length() for s in sequences])
16
 
16
 
17
 print(f"Actual counted: {actual_counted}")
17
 print(f"Actual counted: {actual_counted}")
18
 
18
 
19
-holes = find_holes(filtered)
19
+merged = fh.merge_duplicates(sequences)
20
+merged = [s for s in merged if s.length() > 1]
21
+holes = fh.find_holes(filtered)
20
 
22
 
21
 print(len(holes))
23
 print(len(holes))
22
 
24
 

+ 3
- 8
scripts/read_top.py View File

1
-from million.view.bar_chart import plot as bar_chart
2
 from million.analyze.count_participations import count_participations
1
 from million.analyze.count_participations import count_participations
3
 from million.analyze.retain_counts import retain_counts
2
 from million.analyze.retain_counts import retain_counts
4
 import million.parse.fb_exports as fb
3
 import million.parse.fb_exports as fb
7
 DATA_PATH = './data/'
6
 DATA_PATH = './data/'
8
 
7
 
9
 export = fb.parse_dirfiles(DATA_PATH)
8
 export = fb.parse_dirfiles(DATA_PATH)
10
-
11
 filtered = retain_counts(export.messages)
9
 filtered = retain_counts(export.messages)
12
 
10
 
13
 print(len(filtered))
11
 print(len(filtered))
14
 
12
 
15
-counted_participations = count_participations(filtered, export.participants)
16
-
17
-kept_participations = [
18
-    p for p in counted_participations if p['participations'] > 100]
13
+participations = count_participations(filtered, export.participants, 100)
19
 
14
 
20
-print("\n".join(
21
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
15
+for name, count in participations.items():
16
+    print(f"{name}: {count}")

Loading…
Cancel
Save