3 Revize

Autor SHA1 Zpráva Datum
  Figg 183acd4e97 DNS solver en place, envoie tes photos elias ptn před 9 měsíci
  Figg 1c6f0a2c9d Refacto dans la partie analyze před 9 měsíci
  Figg 27a7516a57 message implémente un identifiant généré automatiquement před 9 měsíci

+ 1
- 0
DNS Zobrazit soubor

@@ -0,0 +1 @@
1
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000

+ 23
- 16
million/analyze/count_participations.py Zobrazit soubor

@@ -1,23 +1,30 @@
1
-
2
-from typing import List
1
+from collections import Counter
2
+from typing import Dict, List
3 3
 from million.model.message import Message
4 4
 from million.model.participant import Participant
5 5
 
6 6
 
7
-def count_participations(messages: List[Message], participants: List[Participant]):
7
+def count_participations(
8
+        messages: List[Message],
9
+        participants: List[Participant] | None = [],
10
+        threshold: int | None = 0
11
+        ) -> Dict[str, int]:
8 12
     """
9
-    Count the number of messages sent by each participant
13
+    Count the number of messages sent by each participant,\n
14
+    you can specify a threshold to return only people having reached that many counts
10 15
     """
11
-    participations = {}
12
-    for participant in participants:
13
-        participations[participant.name] = 0
14
-
15
-    for message in messages:
16
-        if message.sender_name not in participations:
17
-            participations[message.sender_name] = 1
18
-        else:
19
-            participations[message.sender_name] += 1
16
+    participations = dict.fromkeys([p.name for p in participants], 0)
17
+    participations.update(Counter([m.sender_name for m in messages]))
18
+    
19
+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
20 20
 
21
-    ordered_participations = sorted(
22
-        participations.items(), key=lambda x: x[1], reverse=True)
23
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
21
+def podium(
22
+        messages: List[Message],
23
+        top: int,
24
+        participants: List[Participant] | None = [],
25
+        ) -> Dict[str, int]:
26
+    """
27
+    Returns the N biggest counters
28
+    """
29
+    cp = count_participations(messages, participants)
30
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}

+ 36
- 0
million/analyze/dns_solver.py Zobrazit soubor

@@ -0,0 +1,36 @@
1
+from typing import Dict
2
+
3
+from pydantic import BaseModel, PrivateAttr
4
+from million.model.message import Message
5
+
6
+_default_file_path = './DNS'
7
+
8
+class DNS_solver(BaseModel):
9
+    file_path:str = _default_file_path
10
+
11
+    _bank: Dict[str, int] | None = PrivateAttr(None)
12
+
13
+    def solve(self, msg: Message) -> int:
14
+        if self._bank == None: 
15
+            self._bank = self.load(_default_file_path)
16
+
17
+        k = self._get_key(msg)
18
+        if k and k in self._bank: return self._bank[k]
19
+
20
+        return None
21
+
22
+    def load(self, file_name: str) -> Dict[str, int]:
23
+        result = {}
24
+        with open(file_name, 'r') as f:
25
+            for line in f:
26
+                a,b = line.split()
27
+                result[a] = int(b)
28
+
29
+        return result
30
+
31
+    def _get_key(self, msg: Message) -> str:
32
+
33
+        # look into msg attributes
34
+        # find uri
35
+        return (msg.share or None) and msg.share.link or \
36
+            (msg.gifs or None) and msg.gifs[0].uri

+ 0
- 5
million/analyze/find_holes.py Zobrazit soubor

@@ -1,5 +1,3 @@
1
-
2
-
3 1
 from typing import List
4 2
 from million.model.message import Message
5 3
 from million.model.sequence import Sequence
@@ -23,7 +21,6 @@ def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) ->
23 21
 
24 22
     return sequences            
25 23
 
26
-
27 24
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28 25
     """ 
29 26
     Take sequences as an input and returns a list with every
@@ -43,8 +40,6 @@ def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
43 40
 
44 41
     return result
45 42
 
46
-
47
-
48 43
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49 44
     """ 
50 45
     Returns the sequences representing the spaces between

+ 33
- 24
million/analyze/message_evaluation.py Zobrazit soubor

@@ -1,38 +1,47 @@
1
-from math import floor
2 1
 import re
3 2
 from typing import Dict
4 3
 from million.model.message import Message
4
+import million.analyze.dns_solver as dns
5 5
 
6
-memoization: Dict[Message, int] = {}
7 6
 
8
-# TODO WIP
9
-# - DNS to resolve audio, gif, pictures with counts
10
-def __compute__(msg: Message) -> int:
11
-    value = __computeContent(msg)
7
+_memoization: Dict[Message, int] = {}
8
+_dns_solver: dns.DNS_solver = dns.DNS_solver()
12 9
 
13
-    memoization[msg] = value
10
+def get(msg: Message) -> int:
11
+    """
12
+    Returns the estimated value counted in this message
13
+    """
14
+    return _memoization.get(msg, _compute(msg))
15
+
16
+def reset(msg: Message) -> None:
17
+    """
18
+    Drop memorized value of this Message
19
+    """
20
+    if msg in _memoization:
21
+        _memoization.pop(msg)
22
+
23
+def reset() -> None:
24
+    """
25
+    Drop every memorized message value
26
+    """
27
+    _memoization.clear()
28
+
29
+
30
+def _compute(msg: Message) -> int:
31
+    value = _dns_solver.solve(msg) or \
32
+        _computeContent(msg) or \
33
+        None
34
+
35
+    _memoization[msg] = value
14 36
     return value
15 37
 
16
-def __computeContent(msg: Message) -> int:
38
+def _computeContent(msg: Message) -> int:
17 39
     # TODO parse potential math expressions in content
18
-    match = re.search(r"\d+", msg.content)
40
+    match = msg.content and re.search(r"\d+", msg.content)
19 41
     
20 42
     if match:
21
-        value = int(match[0])
43
+        value = int(match.group())
22 44
     else:
23 45
         value = None
24 46
     
25
-    return value
26
-
27
-def reset(msg: Message) -> None:
28
-    if msg in memoization:
29
-        memoization.pop(msg)
30
-
31
-def reset() -> None:
32
-    memoization.clear()
33
-
34
-def get(msg: Message) -> int:
35
-    """
36
-    Returns the estimated value counted in this message
37
-    """
38
-    return memoization.get(msg, __compute__(msg))
47
+    return value

+ 3
- 8
million/analyze/retain_counts.py Zobrazit soubor

@@ -1,15 +1,10 @@
1
-
2
-import re
3 1
 from typing import List
4 2
 from million.model.message import Message
3
+import million.analyze.message_evaluation as msg_val
5 4
 
6 5
 
7 6
 def retain_counts(messages : List[Message])-> List[Message]:
8 7
     """
9
-    Retain only the messages that have a content
8
+    Retain only the messages that have a counted value
10 9
     """
11
-    return [
12
-        m for m in messages 
13
-        if m.content and
14
-        re.search('(\d{2,}|^\d$)', m.content)
15
-        ]
10
+    return [msg for msg in messages if msg_val.get(msg)]

+ 1
- 1
million/analyze/word_finder.py Zobrazit soubor

@@ -4,7 +4,7 @@ from million.model.message import Message
4 4
 
5 5
 
6 6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
7
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
7
+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
8 8
     return msg.content and re.search(rgx, msg.content, re.I)
9 9
 
10 10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:

+ 4
- 3
million/model/fb_export.py Zobrazit soubor

@@ -1,11 +1,10 @@
1 1
 from __future__ import annotations
2
-
3 2
 from typing import Any, List, Set
4 3
 from pydantic import BaseModel
5
-
6 4
 from million.model.message import Message
7 5
 from million.model.participant import Participant
8 6
 
7
+
9 8
 class Image(BaseModel):
10 9
     creation_timestamp: int
11 10
     uri: str
@@ -28,6 +27,7 @@ class FacebookExport(BaseModel):
28 27
     image: Image
29 28
     joinable_mode: JoinableMode
30 29
 
30
+
31 31
     def merge(self, other: FacebookExport) -> None:
32 32
         if self == other:
33 33
             self.messages.extend(other.messages)
@@ -37,7 +37,8 @@ class FacebookExport(BaseModel):
37 37
     def sort(self) -> None:
38 38
         self.messages.sort(key = lambda m: m.timestamp_ms)
39 39
 
40
-    # NOTE Toughen equality conditions ?
40
+
41 41
     def __eq__(self, other: FacebookExport) -> bool:
42
+        # NOTE Toughen equality conditions ?
42 43
         return self.title == other.title \
43 44
             and self.image == other.image

+ 11
- 3
million/model/message.py Zobrazit soubor

@@ -1,7 +1,7 @@
1 1
 from datetime import datetime
2
-from math import floor
3 2
 from typing import Any, List
4
-from pydantic import BaseModel
3
+from uuid import uuid4
4
+from pydantic import BaseModel, PrivateAttr, computed_field
5 5
 
6 6
 class Reaction(BaseModel):
7 7
     reaction: str
@@ -45,10 +45,18 @@ class Message(BaseModel):
45 45
     is_unsent: bool | None = None
46 46
     is_geoblocked_for_viewer: bool
47 47
 
48
+    _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
49
+
50
+
48 51
     def __str__(self) -> str:
49 52
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50 53
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51 54
         return f"{self.sender_name}({dt_str}) : {self.content}"
52 55
 
53 56
     def __hash__(self) -> int:
54
-        return hash(self.sender_name + str(self.timestamp_ms))
57
+        return hash(self.item_id)
58
+    
59
+    @computed_field
60
+    @property
61
+    def item_id(self) -> str:
62
+        return self._id

+ 2
- 5
million/model/sequence.py Zobrazit soubor

@@ -1,8 +1,5 @@
1 1
 from __future__ import annotations
2
-
3
-from pydantic import BaseModel
4
-import pydantic
5
-
2
+from pydantic import validator, BaseModel
6 3
 from million.model.message import Message
7 4
 import million.analyze.message_evaluation as msg_val
8 5
 
@@ -11,7 +8,7 @@ class Sequence(BaseModel):
11 8
     start_message: Message
12 9
     end_message: Message | None = None
13 10
 
14
-    @pydantic.validator('end_message', pre=True, always=True)
11
+    @validator('end_message', pre=True, always=True)
15 12
     def default_end_message(cls, v, *, values):
16 13
         return v or values['start_message'] 
17 14
 

+ 1
- 0
million/parse/fb_exports.py Zobrazit soubor

@@ -48,6 +48,7 @@ def parse_dirfiles(file_dir: str) -> FacebookExport:
48 48
     result.sort()
49 49
     return result
50 50
 
51
+
51 52
 def __read_broken_fb_json(binary_data):
52 53
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53 54
     repaired = re.sub(

+ 2
- 6
scripts/find_gromots.py Zobrazit soubor

@@ -1,14 +1,9 @@
1 1
 from datetime import datetime
2 2
 from million.analyze.word_finder import findWords
3
-from million.parse.fb_exports import FacebookExportParser
3
+import million.parse.fb_exports as fb
4 4
 
5 5
 
6 6
 DATA_PATH = './data/'
7
-
8
-parser = FacebookExportParser()
9
-
10
-export = parser.parse(DATA_PATH)
11
-
12 7
 gros_mots = [
13 8
     '.*merde.*',
14 9
     'sexe',
@@ -27,6 +22,7 @@ gros_mots = [
27 22
     'bais.*'
28 23
     ]
29 24
 
25
+export = fb.parse_dirfiles(DATA_PATH)
30 26
 msg_gros_mots = findWords(export.messages, gros_mots)
31 27
 
32 28
 msg_gros_mots_grp = {}

+ 5
- 3
scripts/find_holes.py Zobrazit soubor

@@ -1,5 +1,5 @@
1 1
 from datetime import datetime
2
-from million.analyze.find_holes import compute_sequences, find_holes
2
+import million.analyze.find_holes as fh
3 3
 from million.analyze.retain_counts import retain_counts
4 4
 import million.parse.fb_exports as fb
5 5
 
@@ -10,13 +10,15 @@ export = fb.parse_dirfiles(DATA_PATH)
10 10
 
11 11
 filtered = retain_counts(export.messages)
12 12
 
13
-sequences = compute_sequences(filtered)
13
+sequences = fh.compute_sequences(filtered)
14 14
 
15 15
 actual_counted = sum([s.length() for s in sequences])
16 16
 
17 17
 print(f"Actual counted: {actual_counted}")
18 18
 
19
-holes = find_holes(filtered)
19
+merged = fh.merge_duplicates(sequences)
20
+merged = [s for s in merged if s.length() > 1]
21
+holes = fh.find_holes(filtered)
20 22
 
21 23
 print(len(holes))
22 24
 

+ 3
- 8
scripts/read_top.py Zobrazit soubor

@@ -1,4 +1,3 @@
1
-from million.view.bar_chart import plot as bar_chart
2 1
 from million.analyze.count_participations import count_participations
3 2
 from million.analyze.retain_counts import retain_counts
4 3
 import million.parse.fb_exports as fb
@@ -7,15 +6,11 @@ import million.parse.fb_exports as fb
7 6
 DATA_PATH = './data/'
8 7
 
9 8
 export = fb.parse_dirfiles(DATA_PATH)
10
-
11 9
 filtered = retain_counts(export.messages)
12 10
 
13 11
 print(len(filtered))
14 12
 
15
-counted_participations = count_participations(filtered, export.participants)
16
-
17
-kept_participations = [
18
-    p for p in counted_participations if p['participations'] > 100]
13
+participations = count_participations(filtered, export.participants, 100)
19 14
 
20
-print("\n".join(
21
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
15
+for name, count in participations.items():
16
+    print(f"{name}: {count}")

Načítá se…
Zrušit
Uložit