3 Commits

Author SHA1 Message Date
  Figg 183acd4e97 DNS solver en place, envoie tes photos elias ptn 9 months ago
  Figg 1c6f0a2c9d Refacto dans la partie analyze 9 months ago
  Figg 27a7516a57 message implémente un identifiant généré automatiquement 9 months ago

+ 1
- 0
DNS View File

1
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000

+ 23
- 16
million/analyze/count_participations.py View File

1
-
2
-from typing import List
1
+from collections import Counter
2
+from typing import Dict, List
3
 from million.model.message import Message
3
 from million.model.message import Message
4
 from million.model.participant import Participant
4
 from million.model.participant import Participant
5
 
5
 
6
 
6
 
7
-def count_participations(messages: List[Message], participants: List[Participant]):
7
+def count_participations(
8
+        messages: List[Message],
9
+        participants: List[Participant] | None = [],
10
+        threshold: int | None = 0
11
+        ) -> Dict[str, int]:
8
     """
12
     """
9
-    Count the number of messages sent by each participant
13
+    Count the number of messages sent by each participant,\n
14
+    you can specify a threshold to return only people having reached that many counts
10
     """
15
     """
11
-    participations = {}
12
-    for participant in participants:
13
-        participations[participant.name] = 0
14
-
15
-    for message in messages:
16
-        if message.sender_name not in participations:
17
-            participations[message.sender_name] = 1
18
-        else:
19
-            participations[message.sender_name] += 1
16
+    participations = dict.fromkeys([p.name for p in participants], 0)
17
+    participations.update(Counter([m.sender_name for m in messages]))
18
+    
19
+    return {k: v for k,v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
20
 
20
 
21
-    ordered_participations = sorted(
22
-        participations.items(), key=lambda x: x[1], reverse=True)
23
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
21
+def podium(
22
+        messages: List[Message],
23
+        top: int,
24
+        participants: List[Participant] | None = [],
25
+        ) -> Dict[str, int]:
26
+    """
27
+    Returns the N biggest counters
28
+    """
29
+    cp = count_participations(messages, participants)
30
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}

+ 36
- 0
million/analyze/dns_solver.py View File

1
+from typing import Dict
2
+
3
+from pydantic import BaseModel, PrivateAttr
4
+from million.model.message import Message
5
+
6
+_default_file_path = './DNS'
7
+
8
+class DNS_solver(BaseModel):
9
+    file_path:str = _default_file_path
10
+
11
+    _bank: Dict[str, int] | None = PrivateAttr(None)
12
+
13
+    def solve(self, msg: Message) -> int:
14
+        if self._bank == None: 
15
+            self._bank = self.load(_default_file_path)
16
+
17
+        k = self._get_key(msg)
18
+        if k and k in self._bank: return self._bank[k]
19
+
20
+        return None
21
+
22
+    def load(self, file_name: str) -> Dict[str, int]:
23
+        result = {}
24
+        with open(file_name, 'r') as f:
25
+            for line in f:
26
+                a,b = line.split()
27
+                result[a] = int(b)
28
+
29
+        return result
30
+
31
+    def _get_key(self, msg: Message) -> str:
32
+
33
+        # look into msg attributes
34
+        # find uri
35
+        return (msg.share or None) and msg.share.link or \
36
+            (msg.gifs or None) and msg.gifs[0].uri

+ 0
- 5
million/analyze/find_holes.py View File

1
-
2
-
3
 from typing import List
1
 from typing import List
4
 from million.model.message import Message
2
 from million.model.message import Message
5
 from million.model.sequence import Sequence
3
 from million.model.sequence import Sequence
23
 
21
 
24
     return sequences            
22
     return sequences            
25
 
23
 
26
-
27
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
24
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
28
     """ 
25
     """ 
29
     Take sequences as an input and returns a list with every
26
     Take sequences as an input and returns a list with every
43
 
40
 
44
     return result
41
     return result
45
 
42
 
46
-
47
-
48
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
43
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
49
     """ 
44
     """ 
50
     Returns the sequences representing the spaces between
45
     Returns the sequences representing the spaces between

+ 33
- 24
million/analyze/message_evaluation.py View File

1
-from math import floor
2
 import re
1
 import re
3
 from typing import Dict
2
 from typing import Dict
4
 from million.model.message import Message
3
 from million.model.message import Message
4
+import million.analyze.dns_solver as dns
5
 
5
 
6
-memoization: Dict[Message, int] = {}
7
 
6
 
8
-# TODO WIP
9
-# - DNS to resolve audio, gif, pictures with counts
10
-def __compute__(msg: Message) -> int:
11
-    value = __computeContent(msg)
7
+_memoization: Dict[Message, int] = {}
8
+_dns_solver: dns.DNS_solver = dns.DNS_solver()
12
 
9
 
13
-    memoization[msg] = value
10
+def get(msg: Message) -> int:
11
+    """
12
+    Returns the estimated value counted in this message
13
+    """
14
+    return _memoization.get(msg, _compute(msg))
15
+
16
+def reset(msg: Message) -> None:
17
+    """
18
+    Drop memorized value of this Message
19
+    """
20
+    if msg in _memoization:
21
+        _memoization.pop(msg)
22
+
23
+def reset() -> None:
24
+    """
25
+    Drop every memorized message value
26
+    """
27
+    _memoization.clear()
28
+
29
+
30
+def _compute(msg: Message) -> int:
31
+    value = _dns_solver.solve(msg) or \
32
+        _computeContent(msg) or \
33
+        None
34
+
35
+    _memoization[msg] = value
14
     return value
36
     return value
15
 
37
 
16
-def __computeContent(msg: Message) -> int:
38
+def _computeContent(msg: Message) -> int:
17
     # TODO parse potential math expressions in content
39
     # TODO parse potential math expressions in content
18
-    match = re.search(r"\d+", msg.content)
40
+    match = msg.content and re.search(r"\d+", msg.content)
19
     
41
     
20
     if match:
42
     if match:
21
-        value = int(match[0])
43
+        value = int(match.group())
22
     else:
44
     else:
23
         value = None
45
         value = None
24
     
46
     
25
-    return value
26
-
27
-def reset(msg: Message) -> None:
28
-    if msg in memoization:
29
-        memoization.pop(msg)
30
-
31
-def reset() -> None:
32
-    memoization.clear()
33
-
34
-def get(msg: Message) -> int:
35
-    """
36
-    Returns the estimated value counted in this message
37
-    """
38
-    return memoization.get(msg, __compute__(msg))
47
+    return value

+ 3
- 8
million/analyze/retain_counts.py View File

1
-
2
-import re
3
 from typing import List
1
 from typing import List
4
 from million.model.message import Message
2
 from million.model.message import Message
3
+import million.analyze.message_evaluation as msg_val
5
 
4
 
6
 
5
 
7
 def retain_counts(messages : List[Message])-> List[Message]:
6
 def retain_counts(messages : List[Message])-> List[Message]:
8
     """
7
     """
9
-    Retain only the messages that have a content
8
+    Retain only the messages that have a counted value
10
     """
9
     """
11
-    return [
12
-        m for m in messages 
13
-        if m.content and
14
-        re.search('(\d{2,}|^\d$)', m.content)
15
-        ]
10
+    return [msg for msg in messages if msg_val.get(msg)]

+ 1
- 1
million/analyze/word_finder.py View File

4
 
4
 
5
 
5
 
6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
6
 def _wordFilter(msg: Message, words: List[str]) -> bool:
7
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
7
+    rgx = r"(\b"+ r"\b|\b".join(words) + r"\b)"
8
     return msg.content and re.search(rgx, msg.content, re.I)
8
     return msg.content and re.search(rgx, msg.content, re.I)
9
 
9
 
10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:
10
 def findWords(messages: List[Message], words: List[str]) -> List[Message]:

+ 4
- 3
million/model/fb_export.py View File

1
 from __future__ import annotations
1
 from __future__ import annotations
2
-
3
 from typing import Any, List, Set
2
 from typing import Any, List, Set
4
 from pydantic import BaseModel
3
 from pydantic import BaseModel
5
-
6
 from million.model.message import Message
4
 from million.model.message import Message
7
 from million.model.participant import Participant
5
 from million.model.participant import Participant
8
 
6
 
7
+
9
 class Image(BaseModel):
8
 class Image(BaseModel):
10
     creation_timestamp: int
9
     creation_timestamp: int
11
     uri: str
10
     uri: str
28
     image: Image
27
     image: Image
29
     joinable_mode: JoinableMode
28
     joinable_mode: JoinableMode
30
 
29
 
30
+
31
     def merge(self, other: FacebookExport) -> None:
31
     def merge(self, other: FacebookExport) -> None:
32
         if self == other:
32
         if self == other:
33
             self.messages.extend(other.messages)
33
             self.messages.extend(other.messages)
37
     def sort(self) -> None:
37
     def sort(self) -> None:
38
         self.messages.sort(key = lambda m: m.timestamp_ms)
38
         self.messages.sort(key = lambda m: m.timestamp_ms)
39
 
39
 
40
-    # NOTE Toughen equality conditions ?
40
+
41
     def __eq__(self, other: FacebookExport) -> bool:
41
     def __eq__(self, other: FacebookExport) -> bool:
42
+        # NOTE Toughen equality conditions ?
42
         return self.title == other.title \
43
         return self.title == other.title \
43
             and self.image == other.image
44
             and self.image == other.image

+ 11
- 3
million/model/message.py View File

1
 from datetime import datetime
1
 from datetime import datetime
2
-from math import floor
3
 from typing import Any, List
2
 from typing import Any, List
4
-from pydantic import BaseModel
3
+from uuid import uuid4
4
+from pydantic import BaseModel, PrivateAttr, computed_field
5
 
5
 
6
 class Reaction(BaseModel):
6
 class Reaction(BaseModel):
7
     reaction: str
7
     reaction: str
45
     is_unsent: bool | None = None
45
     is_unsent: bool | None = None
46
     is_geoblocked_for_viewer: bool
46
     is_geoblocked_for_viewer: bool
47
 
47
 
48
+    _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
49
+
50
+
48
     def __str__(self) -> str:
51
     def __str__(self) -> str:
49
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
52
         dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
50
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
53
         dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
51
         return f"{self.sender_name}({dt_str}) : {self.content}"
54
         return f"{self.sender_name}({dt_str}) : {self.content}"
52
 
55
 
53
     def __hash__(self) -> int:
56
     def __hash__(self) -> int:
54
-        return hash(self.sender_name + str(self.timestamp_ms))
57
+        return hash(self.item_id)
58
+    
59
+    @computed_field
60
+    @property
61
+    def item_id(self) -> str:
62
+        return self._id

+ 2
- 5
million/model/sequence.py View File

1
 from __future__ import annotations
1
 from __future__ import annotations
2
-
3
-from pydantic import BaseModel
4
-import pydantic
5
-
2
+from pydantic import validator, BaseModel
6
 from million.model.message import Message
3
 from million.model.message import Message
7
 import million.analyze.message_evaluation as msg_val
4
 import million.analyze.message_evaluation as msg_val
8
 
5
 
11
     start_message: Message
8
     start_message: Message
12
     end_message: Message | None = None
9
     end_message: Message | None = None
13
 
10
 
14
-    @pydantic.validator('end_message', pre=True, always=True)
11
+    @validator('end_message', pre=True, always=True)
15
     def default_end_message(cls, v, *, values):
12
     def default_end_message(cls, v, *, values):
16
         return v or values['start_message'] 
13
         return v or values['start_message'] 
17
 
14
 

+ 1
- 0
million/parse/fb_exports.py View File

48
     result.sort()
48
     result.sort()
49
     return result
49
     return result
50
 
50
 
51
+
51
 def __read_broken_fb_json(binary_data):
52
 def __read_broken_fb_json(binary_data):
52
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
53
     repaired = re.sub(
54
     repaired = re.sub(

+ 2
- 6
scripts/find_gromots.py View File

1
 from datetime import datetime
1
 from datetime import datetime
2
 from million.analyze.word_finder import findWords
2
 from million.analyze.word_finder import findWords
3
-from million.parse.fb_exports import FacebookExportParser
3
+import million.parse.fb_exports as fb
4
 
4
 
5
 
5
 
6
 DATA_PATH = './data/'
6
 DATA_PATH = './data/'
7
-
8
-parser = FacebookExportParser()
9
-
10
-export = parser.parse(DATA_PATH)
11
-
12
 gros_mots = [
7
 gros_mots = [
13
     '.*merde.*',
8
     '.*merde.*',
14
     'sexe',
9
     'sexe',
27
     'bais.*'
22
     'bais.*'
28
     ]
23
     ]
29
 
24
 
25
+export = fb.parse_dirfiles(DATA_PATH)
30
 msg_gros_mots = findWords(export.messages, gros_mots)
26
 msg_gros_mots = findWords(export.messages, gros_mots)
31
 
27
 
32
 msg_gros_mots_grp = {}
28
 msg_gros_mots_grp = {}

+ 5
- 3
scripts/find_holes.py View File

1
 from datetime import datetime
1
 from datetime import datetime
2
-from million.analyze.find_holes import compute_sequences, find_holes
2
+import million.analyze.find_holes as fh
3
 from million.analyze.retain_counts import retain_counts
3
 from million.analyze.retain_counts import retain_counts
4
 import million.parse.fb_exports as fb
4
 import million.parse.fb_exports as fb
5
 
5
 
10
 
10
 
11
 filtered = retain_counts(export.messages)
11
 filtered = retain_counts(export.messages)
12
 
12
 
13
-sequences = compute_sequences(filtered)
13
+sequences = fh.compute_sequences(filtered)
14
 
14
 
15
 actual_counted = sum([s.length() for s in sequences])
15
 actual_counted = sum([s.length() for s in sequences])
16
 
16
 
17
 print(f"Actual counted: {actual_counted}")
17
 print(f"Actual counted: {actual_counted}")
18
 
18
 
19
-holes = find_holes(filtered)
19
+merged = fh.merge_duplicates(sequences)
20
+merged = [s for s in merged if s.length() > 1]
21
+holes = fh.find_holes(filtered)
20
 
22
 
21
 print(len(holes))
23
 print(len(holes))
22
 
24
 

+ 3
- 8
scripts/read_top.py View File

1
-from million.view.bar_chart import plot as bar_chart
2
 from million.analyze.count_participations import count_participations
1
 from million.analyze.count_participations import count_participations
3
 from million.analyze.retain_counts import retain_counts
2
 from million.analyze.retain_counts import retain_counts
4
 import million.parse.fb_exports as fb
3
 import million.parse.fb_exports as fb
7
 DATA_PATH = './data/'
6
 DATA_PATH = './data/'
8
 
7
 
9
 export = fb.parse_dirfiles(DATA_PATH)
8
 export = fb.parse_dirfiles(DATA_PATH)
10
-
11
 filtered = retain_counts(export.messages)
9
 filtered = retain_counts(export.messages)
12
 
10
 
13
 print(len(filtered))
11
 print(len(filtered))
14
 
12
 
15
-counted_participations = count_participations(filtered, export.participants)
16
-
17
-kept_participations = [
18
-    p for p in counted_participations if p['participations'] > 100]
13
+participations = count_participations(filtered, export.participants, 100)
19
 
14
 
20
-print("\n".join(
21
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
15
+for name, count in participations.items():
16
+    print(f"{name}: {count}")

Loading…
Cancel
Save