3 コミット

作成者 SHA1 メッセージ 日付
  Figg 183acd4e97 DNS solver en place, envoie tes photos elias ptn 6ヶ月前
  Figg 1c6f0a2c9d Refacto dans la partie analyze 6ヶ月前
  Figg 27a7516a57 message implémente un identifiant généré automatiquement 6ヶ月前

+ 1
- 0
DNS ファイルの表示

@@ -0,0 +1 @@
1
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000

+ 23
- 16
million/analyze/count_participations.py ファイルの表示

@@ -1,23 +1,30 @@
1
-
2
-from typing import List
1
+from collections import Counter
2
+from typing import Dict, List
3 3
 from million.model.message import Message
4 4
 from million.model.participant import Participant
5 5
 
6 6
 
7
-def count_participations(messages: List[Message], participants: List[Participant]):
7
+def count_participations(
8
+        messages: List[Message],
9
+        participants: List[Participant] | None = [],
10
+        threshold: int | None = 0
11
+        ) -> Dict[str, int]:
8 12
     """
9
-    Count the number of messages sent by each participant
13
+    Count the number of messages sent by each participant,\n
14
+    you can specify a threshold to return only people having reached that many counts
10 15
     """
11
-    participations = {}
12
-    for participant in participants:
13
-        participations[participant.name] = 0
14
-
15
-    for message in messages:
16
-        if message.sender_name not in participations:
17
-            participations[message.sender_name] = 1
18
-        else:
19
-            participations[message.sender_name] += 1
16
+    participations = dict.fromkeys([p.name for p in participants], 0)
17
+    participations.update(Counter([m.sender_name for m in messages]))
18
+    
19
+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
20 20
 
21
-    ordered_participations = sorted(
22
-        participations.items(), key=lambda x: x[1], reverse=True)
23
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
21
+def podium(
22
+        messages: List[Message],
23
+        top: int,
24
+        participants: List[Participant] | None = [],
25
+        ) -> Dict[str, int]:
26
+    """
27
+    Returns the N biggest counters
28
+    """
29
+    cp = count_participations(messages, participants)
30
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}

+ 36
- 0
million/analyze/dns_solver.py ファイルの表示

@@ -0,0 +1,36 @@
1
+from typing import Dict
2
+
3
+from pydantic import BaseModel, PrivateAttr
4
+from million.model.message import Message
5
+
6
+_default_file_path = './DNS'
7
+
8
+class DNS_solver(BaseModel):
9
+    file_path:str = _default_file_path
10
+
11
+    _bank: Dict[str, int] | None = PrivateAttr(None)
12
+
13
+    def solve(self, msg: Message) -> int:
14
+        if self._bank == None: 
15
+            self._bank = self.load(_default_file_path)
16
+
17
+        k = self._get_key(msg)
18
+        if k and k in self._bank: return self._bank[k]
19
+
20
+        return None
21
+
22
+    def load(self, file_name: str) -> Dict[str, int]:
23
+        result = {}
24
+        with open(file_name, 'r') as f:
25
+            for line in f:
26
+                a,b = line.split()
27
+                result[a] = int(b)
28
+
29
+        return result
30
+
31
+    def _get_key(self, msg: Message) -> str:
32
+
33
+        # look into msg attributes
34
+        # find uri
35
+        return (msg.share or None) and msg.share.link or \
36
+            (msg.gifs or None) and msg.gifs[0].uri

+ 0
- 5
million/analyze/find_holes.py ファイルの表示

@@ -1,5 +1,3 @@
1
-
2
-
3 1
 from typing import List
4 2
 from million.model.message import Message
5 3
 from million.model.sequence import Sequence
@@ -23,7 +21,6 @@ def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) ->
23 21
 
24 22
     return sequences            
25 23
 
26
-
27 24
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28 25
     """ 
29 26
     Take sequences as an input and returns a list with every
@@ -43,8 +40,6 @@ def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
43 40
 
44 41
     return result
45 42
 
46
-
47
-
48 43
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49 44
     """ 
50 45
     Returns the sequences representing the spaces between

+ 33
- 24
million/analyze/message_evaluation.py ファイルの表示

@@ -1,38 +1,47 @@
1
-from math import floor
2 1
 import re
3 2
 from typing import Dict
4 3
 from million.model.message import Message
4
+import million.analyze.dns_solver as dns
5 5
 
6
-memoization: Dict[Message, int] = {}
7 6
 
8
-# TODO WIP
9
-# - DNS to resolve audio, gif, pictures with counts
10
-def __compute__(msg: Message) -> int:
11
-    value = __computeContent(msg)
7
+_memoization: Dict[Message, int] = {}
8
+_dns_solver: dns.DNS_solver = dns.DNS_solver()
12 9
 
13
-    memoization[msg] = value
10
+def get(msg: Message) -> int:
11
+    """
12
+    Returns the estimated value counted in this message
13
+    """
14
+    return _memoization.get(msg, _compute(msg))
15
+
16
+def reset(msg: Message) -> None:
17
+    """
18
+    Drop memorized value of this Message
19
+    """
20
+    if msg in _memoization:
21
+        _memoization.pop(msg)
22
+
23
+def reset() -> None:
24
+    """
25
+    Drop every memorized message value
26
+    """
27
+    _memoization.clear()
28
+
29
+
30
+def _compute(msg: Message) -> int:
31
+    value = _dns_solver.solve(msg) or \
32
+        _computeContent(msg) or \
33
+        None
34
+
35
+    _memoization[msg] = value
14 36
     return value
15 37
 
16
-def __computeContent(msg: Message) -> int:
38
+def _computeContent(msg: Message) -> int:
17 39
     # TODO parse potential math expressions in content
18
-    match = re.search(r"\d+", msg.content)
40
+    match = msg.content and re.search(r"\d+", msg.content)
19 41
     
20 42
     if match:
21
-        value = int(match[0])
43
+        value = int(match.group())
22 44
     else:
23 45
         value = None
24 46
     
25
-    return value
26
-
27
-def reset(msg: Message) -> None:
28
-    if msg in memoization:
29
-        memoization.pop(msg)
30
-
31
-def reset() -> None:
32
-    memoization.clear()
33
-
34
-def get(msg: Message) -> int:
35
-    """
36
-    Returns the estimated value counted in this message
37
-    """
38
-    return memoization.get(msg, __compute__(msg))
47
+    return value

+ 3
- 8
million/analyze/retain_counts.py ファイルの表示

@@ -1,15 +1,10 @@
1
-
2
-import re
3 1
 from typing import List
4 2
 from million.model.message import Message
3
+import million.analyze.message_evaluation as msg_val
5 4
 
6 5
 
7 6
 def retain_counts(messages : List[Message])-> List[Message]:
8 7
     """
9
-    Retain only the messages that have a content
8
+    Retain only the messages that have a counted value
10 9
     """
11
-    return [
12
-        m for m in messages 
13
-        if m.content and
14
-        re.search('(\d{2,}|^\d$)', m.content)
15
-        ]
10
+    return [msg for msg in messages if msg_val.get(msg)]

+ 1
- 1
million/analyze/word_finder.py ファイルの表示

@@ -4,7 +4,7 @@ from million.model.message import Message
4 4
 
5 5
 
6 6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
7
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
7
+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
8 8
     return msg.content and re.search(rgx, msg.content, re.I)
9 9
 
10 10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:

+ 4
- 3
million/model/fb_export.py ファイルの表示

@@ -1,11 +1,10 @@
1 1
 from __future__ import annotations
2
-
3 2
 from typing import Any, List, Set
4 3
 from pydantic import BaseModel
5
-
6 4
 from million.model.message import Message
7 5
 from million.model.participant import Participant
8 6
 
7
+
9 8
 class Image(BaseModel):
10 9
     creation_timestamp: int
11 10
     uri: str
@@ -28,6 +27,7 @@ class FacebookExport(BaseModel):
28 27
     image: Image
29 28
     joinable_mode: JoinableMode
30 29
 
30
+
31 31
     def merge(self, other: FacebookExport) -> None:
32 32
         if self == other:
33 33
             self.messages.extend(other.messages)
@@ -37,7 +37,8 @@ class FacebookExport(BaseModel):
37 37
     def sort(self) -> None:
38 38
         self.messages.sort(key = lambda m: m.timestamp_ms)
39 39
 
40
-    # NOTE Toughen equality conditions ?
40
+
41 41
     def __eq__(self, other: FacebookExport) -> bool:
42
+        # NOTE Toughen equality conditions ?
42 43
         return self.title == other.title \
43 44
             and self.image == other.image

+ 11
- 3
million/model/message.py ファイルの表示

@@ -1,7 +1,7 @@
1 1
 from datetime import datetime
2
-from math import floor
3 2
 from typing import Any, List
4
-from pydantic import BaseModel
3
+from uuid import uuid4
4
+from pydantic import BaseModel, PrivateAttr, computed_field
5 5
 
6 6
 class Reaction(BaseModel):
7 7
     reaction: str
@@ -45,10 +45,18 @@ class Message(BaseModel):
45 45
     is_unsent: bool | None = None
46 46
     is_geoblocked_for_viewer: bool
47 47
 
48
+    _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
49
+
50
+
48 51
     def __str__(self) -> str:
49 52
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50 53
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51 54
         return f"{self.sender_name}({dt_str}) : {self.content}"
52 55
 
53 56
     def __hash__(self) -> int:
54
-        return hash(self.sender_name + str(self.timestamp_ms))
57
+        return hash(self.item_id)
58
+    
59
+    @computed_field
60
+    @property
61
+    def item_id(self) -> str:
62
+        return self._id

+ 2
- 5
million/model/sequence.py ファイルの表示

@@ -1,8 +1,5 @@
1 1
 from __future__ import annotations
2
-
3
-from pydantic import BaseModel
4
-import pydantic
5
-
2
+from pydantic import validator, BaseModel
6 3
 from million.model.message import Message
7 4
 import million.analyze.message_evaluation as msg_val
8 5
 
@@ -11,7 +8,7 @@ class Sequence(BaseModel):
11 8
     start_message: Message
12 9
     end_message: Message | None = None
13 10
 
14
-    @pydantic.validator('end_message', pre=True, always=True)
11
+    @validator('end_message', pre=True, always=True)
15 12
     def default_end_message(cls, v, *, values):
16 13
         return v or values['start_message'] 
17 14
 

+ 1
- 0
million/parse/fb_exports.py ファイルの表示

@@ -48,6 +48,7 @@ def parse_dirfiles(file_dir: str) -> FacebookExport:
48 48
     result.sort()
49 49
     return result
50 50
 
51
+
51 52
 def __read_broken_fb_json(binary_data):
52 53
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53 54
     repaired = re.sub(

+ 2
- 6
scripts/find_gromots.py ファイルの表示

@@ -1,14 +1,9 @@
1 1
 from datetime import datetime
2 2
 from million.analyze.word_finder import findWords
3
-from million.parse.fb_exports import FacebookExportParser
3
+import million.parse.fb_exports as fb
4 4
 
5 5
 
6 6
 DATA_PATH = './data/'
7
-
8
-parser = FacebookExportParser()
9
-
10
-export = parser.parse(DATA_PATH)
11
-
12 7
 gros_mots = [
13 8
     '.*merde.*',
14 9
     'sexe',
@@ -27,6 +22,7 @@ gros_mots = [
27 22
     'bais.*'
28 23
     ]
29 24
 
25
+export = fb.parse_dirfiles(DATA_PATH)
30 26
 msg_gros_mots = findWords(export.messages, gros_mots)
31 27
 
32 28
 msg_gros_mots_grp = {}

+ 5
- 3
scripts/find_holes.py ファイルの表示

@@ -1,5 +1,5 @@
1 1
 from datetime import datetime
2
-from million.analyze.find_holes import compute_sequences, find_holes
2
+import million.analyze.find_holes as fh
3 3
 from million.analyze.retain_counts import retain_counts
4 4
 import million.parse.fb_exports as fb
5 5
 
@@ -10,13 +10,15 @@ export = fb.parse_dirfiles(DATA_PATH)
10 10
 
11 11
 filtered = retain_counts(export.messages)
12 12
 
13
-sequences = compute_sequences(filtered)
13
+sequences = fh.compute_sequences(filtered)
14 14
 
15 15
 actual_counted = sum([s.length() for s in sequences])
16 16
 
17 17
 print(f"Actual counted: {actual_counted}")
18 18
 
19
-holes = find_holes(filtered)
19
+merged = fh.merge_duplicates(sequences)
20
+merged = [s for s in merged if s.length() > 1]
21
+holes = fh.find_holes(filtered)
20 22
 
21 23
 print(len(holes))
22 24
 

+ 3
- 8
scripts/read_top.py ファイルの表示

@@ -1,4 +1,3 @@
1
-from million.view.bar_chart import plot as bar_chart
2 1
 from million.analyze.count_participations import count_participations
3 2
 from million.analyze.retain_counts import retain_counts
4 3
 import million.parse.fb_exports as fb
@@ -7,15 +6,11 @@ import million.parse.fb_exports as fb
7 6
 DATA_PATH = './data/'
8 7
 
9 8
 export = fb.parse_dirfiles(DATA_PATH)
10
-
11 9
 filtered = retain_counts(export.messages)
12 10
 
13 11
 print(len(filtered))
14 12
 
15
-counted_participations = count_participations(filtered, export.participants)
16
-
17
-kept_participations = [
18
-    p for p in counted_participations if p['participations'] > 100]
13
+participations = count_participations(filtered, export.participants, 100)
19 14
 
20
-print("\n".join(
21
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
15
+for name, count in participations.items():
16
+    print(f"{name}: {count}")

読み込み中…
キャンセル
保存