#9 DEV-Mael

Open
Figg wants to merge 25 commits from DEV-Mael into master

Dockerfile (+1 -0)

@@ -4,6 +4,7 @@ FROM python:3.11-buster
 RUN pip install poetry
 
 COPY pyproject.toml poetry.lock ./
+COPY ./data/DefaultMediaCountMapFile ./data/DefaultMediaCountMapFile
 COPY ./million ./million
 COPY ./test ./test
 

data/DefaultMediaCountMapFile (+45 -0)

@@ -0,0 +1,45 @@
+94965738_575896433034148_2204307637284110336_n_2600568116938585.jpg 300
+84316189_161561335269481_4671060857508069376_n_821996424998042.jpg 307
+102407032_3165952896761223_6148002473225081360_n_2378777012422623.jpg 308
+90507136_213915209958557_5413143962586185728_n_1841388439336535.jpg 309
+104035858_782158855522425_8192024435259743235_n_782158852189092.jpg 666
+104123574_322212138769518_5707692183879973515_n_322212135436185.jpg 667
+104434027_271819697398827_5391503491326101448_n_271819694065494.jpg 1312
+95525936_1164142403920772_8318302524985573376_n_299590661169172.jpg 51
+52421236_278953813000944_2106293885134176256_n_2905362479561945.gif 1664
+20688788_286417625173756_1069917705378725888_n_1207474276261488.gif 666
+38234456_1103482206482169_3464478929153163264_n_2628411810759247.gif 1789
+104662968_217216135921364_332403069450983046_n_217216132588031.jpg 1914
+50706165_602050206907030_9269729130708992_n_3180220838709531.gif 1939
+92978153_661083471350851_7802534939089436672_n_585129509051794.jpg 1984
+104872794_272192197333677_7875491468143786127_n_272192194000344.jpg 1995
+49627753_540217636460822_4914566064169287680_n_2394672374158902.png 2048
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000
+65681880_655872084893722_5358758350790066176_n_4051854338219896.gif 3666
+50165487_1987596788210079_254230440078999552_n_1007765122959718.gif 66
+87358105_203182734373455_1323650921388834816_n_3112616325440519.gif 4810
+83527853_509829533251553_144101650338938880_n_1220206091644612.gif 6369
+84441501_209962830394148_963121690001276928_n_736879910407241.gif 6769
+74608941_770967279996317_3169876449326792704_n_1581830875345515.gif 7269
+20505423_878434838980511_4604695143109361664_n_299194367865591.gif 666
+110264758_573811796634371_8422456995004556652_n_782140962427058.gif 666
+65182313_697973310662653_2741056482018590720_n_110058701154052.gif 666
+120437981_961865127657562_2352191202134666388_n_297241988832056.gif 17000
+133574591_2903570269883547_4546172544540158465_n_1987002451464904.gif 18000
+124066484_677676816444243_7811409333876486154_n_382212936779839.gif 20000
+130166493_156814772857168_4400190561706308563_n_1434915410205159.gif 21000
+131881117_200208011820369_5496884526316665472_n_614869492871247.gif 24000
+122477452_404917850669181_7425532495902993743_n_622781605380862.gif 25000
+60398112_324025954936328_3959780282919288832_n_1298063493943852.gif 26400
+132605238_2250185215114171_4387582615384925988_n_1562306254123019.gif 29000
+83715267_525261428388982_9213116445225910272_n_408695157299629.gif 30000
+223698887_1438459123189233_486429511094530589_n_2894947440818658.gif 40000
+245828092_2845723112338892_4090190909716007091_n_2845723109005559.jpg 36399
+246367043_846550806036575_350641140426701499_n_846550802703242.jpg 36400
+247417430_407831760973532_6702356361214642186_n_407831757640199.jpg 36401
+274103826_545381123173322_5027057711080616063_n_545381116506656.jpg 60909
+273830541_343104791045273_1854911206287093351_n_343104784378607.jpg 60910
+273907776_234819785533495_1080142729732940044_n_234819782200162.jpg 60911
+274242881_999574723977290_4022657018268260987_n_999574720643957.jpg 60912
+274008762_640199633904372_3459422682721277586_n_640199623904373.jpg 60913
+audioclip16419453900003855_1094480794455461.mp4 57612

million/analyze/count_analysis.py (+132 -0)

@@ -0,0 +1,132 @@
+from typing import List
+import million.analyze.message_evaluation as msg_val
+from million.model.message import Message
+
+
+def check_extra_or_missing_letter(word: str, reference: str) -> bool:
+    """
+    Checks whether the str word contains exactly one extra
+    or one missing letter compared to the str reference
+    """
+    len_word = len(word)
+    len_ref = len(reference)
+
+    if abs(len_word - len_ref) != 1:
+        return False
+
+    shortest = word if len_word < len_ref else reference
+    longest = word if len_word > len_ref else reference
+
+    for i in range(len(shortest)):
+        if shortest[i] != longest[i]:
+            return shortest[i:] == longest[i + 1 :]
+
+    return True
+
+
+def check_single_letter_differ(word: str, reference: str) -> bool:
+    """
+    Checks whether the str word contains exactly one letter
+    that differs from the str reference
+    """
+    return sum(1 for x, y in zip(reference, word) if x != y) == 1
+
+
+def check_letter_swap(word: str, reference: str) -> bool:
+    """
+    Checks whether the str word contains exactly one swap of
+    consecutive letters compared to the str reference
+    """
+    if len(word) != len(reference):
+        return False
+
+    for i in range(len(word) - 1):
+        if word[i] != reference[i]:
+            return word[i + 1] + word[i] + word[i + 2 :] == reference[i:]
+
+    return False
+
+
+def check_typo(word: str, reference: str) -> bool:
+    """
+    Checks whether the str word contains a typo relative to the str reference
+    """
+    if len(reference) == len(word):
+        return check_single_letter_differ(word, reference) or check_letter_swap(
+            word, reference
+        )
+    else:
+        return check_extra_or_missing_letter(word, reference)
+
+
+def _check_message_concatenation(messages: List[Message], index: int, expected: int) -> bool:
+    """
+    Determines whether the messages list contains the expected count at the given index
+    by concatenating the values of the following messages.
+    This allows finding a count that was spread over several messages
+    """
+    reference = str(expected)
+    testing = ""
+
+    offset = 0
+
+    while len(testing) < len(reference) and index + offset < len(messages):
+        next_message = messages[index + offset]
+        offset += 1
+        if next_message.sender_name == messages[index].sender_name:
+            testing += str(msg_val.get(next_message))
+
+    return testing == reference
+
+
+def _heavy_check(messages: List[Message], index: int, expected: int) -> bool:
+    """
+    Determines whether the messages list contains the expected count at the given index.
+    It relies on heavier methods that can only find a result
+    once we are outside the nominal case
+    """
+    # TODO
+    #   - add a method for the case where several counts are contained in the same message body
+    #   - add a method for the case where digits are represented by a substitute in the message body,
+    #     e.g. a number spelled out in words (French or Breton), 🍁 for 420, @Elias Cheddar for 69
+    m = messages[index]
+    word = str(msg_val.get(m))
+
+    return _check_message_concatenation(messages, index, expected) or \
+        (check_typo(word, str(expected)) and msg_val.get(messages[index + 1]) == expected + 1)
+
+
+def _check_value_around(messages, index, expected, amplitude_after, amplitude_before):
+    for i in range(1, amplitude_after + 1):
+        if index + i < len(messages) and expected == msg_val.get(messages[index + i]):
+            return index + i
+    for i in range(1, amplitude_before + 1):
+        if index - i >= 0 and expected == msg_val.get(messages[index - i]):
+            return index - i
+
+    return None
+
+
+def search_value_at(messages, index, expected, do_heavy_check=True, amplitude_after=1000, amplitude_before=10):
+    """
+    Determines whether the messages list contains the expected count around the given index.
+    The amplitude parameters set the range to search around the given index.
+    do_heavy_check states whether to push the analysis with heavier methods when the direct lookup fails
+    """
+    # If the current message holds the value, return its index
+    curr_value = msg_val.get(messages[index])
+    if expected == curr_value:
+        return index
+
+    # Otherwise, look around it
+    jump_index = _check_value_around(messages, index, expected, amplitude_after, amplitude_before)
+    if jump_index is not None:
+        return jump_index
+
+    # Finally, if the value is found neither at the given index nor within the
+    # given amplitude, run a heavy check at this spot
+    if do_heavy_check and _heavy_check(messages, index, expected):
+        return index
+
+    # If none of that worked, return None
+    return None
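
Note: a quick sanity sketch of how the typo helpers compose (digit strings invented for illustration):

```python
from million.analyze.count_analysis import check_typo

# same length: one differing letter, or one swap of consecutive letters
assert check_typo("45876", "45976")
assert check_typo("45967", "45976")
# length differs by one: one extra or one missing letter
assert check_typo("4597", "45976")
assert check_typo("459776", "45976")
# an exact match is not a typo
assert not check_typo("45976", "45976")
```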

million/analyze/count_participations.py (+23 -16)

@@ -1,23 +1,30 @@
-
-from typing import List
+from collections import Counter
+from typing import Dict, List
 from million.model.message import Message
 from million.model.participant import Participant
 
 
-def count_participations(messages: List[Message], participants: List[Participant]):
+def count_participations(
+        messages: List[Message],
+        participants: List[Participant] | None = [],
+        threshold: int | None = 0
+        ) -> Dict[str, int]:
     """
-    Count the number of messages sent by each participant
+    Count the number of messages sent by each participant;
+    you can specify a threshold to return only people who have reached that many counts
     """
-    participations = {}
-    for participant in participants:
-        participations[participant.name] = 0
-
-    for message in messages:
-        if message.sender_name not in participations:
-            participations[message.sender_name] = 1
-        else:
-            participations[message.sender_name] += 1
+    participations = dict.fromkeys([p.name for p in participants], 0)
+    participations.update(Counter([m.sender_name for m in messages]))
+
+    return {k: v for k, v in sorted(participations.items(), key=lambda x: -x[1]) if v >= threshold}
 
-    ordered_participations = sorted(
-        participations.items(), key=lambda x: x[1], reverse=True)
-    return [{"name": v[0], "participations": v[1]} for v in ordered_participations]
+def podium(
+        messages: List[Message],
+        top: int,
+        participants: List[Participant] | None = [],
+        ) -> Dict[str, int]:
+    """
+    Returns the N biggest counters
+    """
+    cp = count_participations(messages, participants)
+    return {k: cp[k] for idx, k in enumerate(cp) if idx < top}
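
Note: the reworked API returns a dict ordered by descending count. A usage sketch (names invented; `messages` and `participants` come from a parsed export):

```python
from million.analyze.count_participations import count_participations, podium

counts = count_participations(messages, participants, threshold=100)
# e.g. {"Alice": 4321, "Bob": 2109} : only people with at least 100 counts

for name, count in podium(messages, top=3, participants=participants).items():
    print(f"{name}: {count}")  # the three biggest counters
```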

million/analyze/find_holes.py (+0 -5)

@@ -1,5 +1,3 @@
-
-
 from typing import List
 from million.model.message import Message
 from million.model.sequence import Sequence
@@ -23,7 +21,6 @@ def compute_sequences(messages: List[Message], accepted_max: int = 1_000_000) ->
 
     return sequences
 
-
 def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
     """
     Take sequences as an input and returns a list with every
@@ -43,8 +40,6 @@ def merge_duplicates(sequences: List[Sequence]) -> List[Sequence]:
 
     return result
 
-
-
 def invert_sequences(sequences: List[Sequence]) -> List[Sequence]:
     """
     Returns the sequences representing the spaces between

million/analyze/media_count_mapper.py (+38 -0)

@@ -0,0 +1,38 @@
+from os.path import basename
+from typing import Dict
+
+from pydantic import BaseModel, PrivateAttr
+from million.model.message import Message
+
+_default_file_path = 'data/DefaultMediaCountMapFile'
+
+class MediaCountMapper(BaseModel):
+    file_path: str = _default_file_path
+
+    _bank: Dict[str, int] | None = PrivateAttr(None)
+
+    def solve(self, msg: Message) -> int | None:
+        if self._bank is None:
+            self._bank = self.load(self.file_path)
+
+        k = self._get_key(msg)
+        if k and k in self._bank: return self._bank[k]
+
+        return None
+
+    def load(self, file_name: str) -> Dict[str, int]:
+        result = {}
+        with open(file_name, 'r') as f:
+            for line in f:
+                a, b = line.split()
+                result[a] = int(b)
+
+        return result
+
+    def _get_key(self, msg: Message) -> str | None:
+
+        # The bank key is the message's media identifier:
+        # a share link, or the basename of the first gif/photo uri
+        return (msg.share or None) and msg.share.link or \
+            (msg.gifs or None) and basename(msg.gifs[0].uri) or \
+            (msg.photos or None) and basename(msg.photos[0].uri)
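
Note: a minimal sketch of the mapper in use, assuming the bank file added in this PR (one `<share link or media basename> <count>` pair per line):

```python
from million.analyze.media_count_mapper import MediaCountMapper

mapper = MediaCountMapper()  # lazily loads data/DefaultMediaCountMapFile

# for a message whose first gif uri ends in
# 52421236_278953813000944_2106293885134176256_n_2905362479561945.gif:
print(mapper.solve(msg))  # 1664; None when the media is not in the bank
```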

million/analyze/message_evaluation.py (+37 -25)

@@ -1,38 +1,50 @@
-from math import floor
 import re
 from typing import Dict
 from million.model.message import Message
+import million.analyze.media_count_mapper as mcm
 
-memoization: Dict[Message, int] = {}
 
-# TODO WIP
-# - DNS to resolve audio, gif, pictures with counts
-def __compute__(msg: Message) -> int:
-    value = __computeContent(msg)
+_memoization: Dict[Message, int] = {}
+_dns_solver: mcm.MediaCountMapper = mcm.MediaCountMapper()
 
-    memoization[msg] = value
-    return value
 
-def __computeContent(msg: Message) -> int:
-    # TODO parse potential math expressions in content
-    match = re.search(r"\d+", msg.content)
-
-    if match:
-        value = int(match[0])
-    else:
-        value = None
-
-    return value
+def get(msg: Message) -> int | None:
+    """
+    Returns the estimated value counted in this message
+    """
+    return _memoization[msg] if msg in _memoization else _compute(msg)
+
 
 def reset(msg: Message) -> None:
-    if msg in memoization:
-        memoization.pop(msg)
+    """
+    Drop the memorized value of this Message
+    """
+    if msg in _memoization:
+        _memoization.pop(msg)
 
-def reset() -> None:
-    memoization.clear()
 
-def get(msg: Message) -> int:
+def reset() -> None:
     """
-    Returns the estimated value counted in this message
+    Drop every memorized message value
     """
-    return memoization.get(msg, __compute__(msg))
+    _memoization.clear()
+
+
+def _compute(msg: Message) -> int | None:
+    value = _dns_solver.solve(msg) or _computeContent(msg) or None
+
+    _memoization[msg] = value
+    return value
+
+
+def _computeContent(msg: Message) -> int | None:
+    if not msg.content:
+        return None
+
+    s = re.sub(r'[^\s\d.,]|[.,]{2,}', "", msg.content)
+    match = re.search(r"\d+", s)
+
+    if match:
+        return int(match.group())
+
+    return None
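
Note: tracing `get` on plain text shows what the regex pre-cleaning does (sample message built with the Message model as modified later in this PR):

```python
import million.analyze.message_evaluation as msg_val
from million.model.message import Message

msg = Message(
    sender_name="Figg",
    timestamp_ms=1640995200000,
    content="66...😏😏😏9",
    is_geoblocked_for_viewer=False,
)
# the emoji and the "..." run are stripped, so the digits collapse to "669"
assert msg_val.get(msg) == 669
```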

million/analyze/retain_counts.py (+0 -15)

@@ -1,15 +0,0 @@
-
-import re
-from typing import List
-from million.model.message import Message
-
-
-def retain_counts(messages : List[Message])-> List[Message]:
-    """
-    Retain only the messages that have a content
-    """
-    return [
-        m for m in messages
-        if m.content and
-        re.search('(\d{2,}|^\d$)', m.content)
-        ]

million/analyze/word_finder.py (+45 -5)

@@ -1,11 +1,51 @@
+from datetime import date
 import re
+import million.analyze.message_evaluation as msg_val
 from typing import List
 from million.model.message import Message
 
 
-def _wordFilter(msg: Message, words: List[str]) -> bool:
-    rgx = r"(\b"+ r'\b|\b'.join(words) + r"\b)"
-    return msg.content and re.search(rgx, msg.content, re.I)
+def filter_words(messages: List[Message], words: List[str]) -> List[Message]:
+    """
+    Return every message containing the given words; regex syntax works inside each word,
+    e.g. filter_words(messages, ["dogs?","m(ous|ic)e"])
+    will search for: dog, dogs, mouse, mice
+    """
+    r_words = [rf"\b{w}\b" for w in words]
+    rgx = "(" + "|".join(r_words) + ")"
+    return [m for m in messages if m.content and re.search(rgx, m.content, re.I)]
+
+
+def filter_value(messages: List[Message], val: int) -> List[Message]:
+    """
+    Return every message whose value evaluates to the given val
+    """
+    return [m for m in messages if msg_val.get(m) == val]
+
+
+def filter_date(messages: List[Message], other: date) -> List[Message]:
+    """
+    Return every message posted on the given date
+    """
+    return [m for m in messages if m.date_time.date() == other]
+
+
+def filter_neighbours(
+    messages: List[Message], msg: Message, amplitude: int = 10
+) -> List[Message]:
+    """
+    Return the messages posted just before and after the given one.
+    amplitude indicates how many messages to look for in each direction
+    """
+    idx = messages.index(msg)
+    start_index = max(0, idx - amplitude)
+    end_index = min(len(messages), idx + amplitude + 1)
+    return messages[start_index:end_index]
+
+
+def retain_counts(messages: List[Message]) -> List[Message]:
+    """
+    Retain only the messages considered to hold a counted value
+    """
+    return [msg for msg in messages if msg_val.get(msg)]
 
-def findWords(messages: List[Message], words: List[str]) -> List[Message]:
-    return filter(lambda m: _wordFilter(m, words), messages)
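
Note: since `filter_words` wraps each entry in `\b...\b` and joins them with `|`, plain words and regex fragments mix freely (patterns invented; `messages` from a parsed export):

```python
from million.analyze.word_finder import filter_words

# matches "dog", "dogs", "mouse" and "mice", case-insensitively
hits = filter_words(messages, ["dogs?", "m(ous|ic)e"])
```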

million/model/fb_export.py (+5 -4)

@@ -1,11 +1,10 @@
 from __future__ import annotations
-
 from typing import Any, List, Set
 from pydantic import BaseModel
-
 from million.model.message import Message
 from million.model.participant import Participant
 
+
 class Image(BaseModel):
     creation_timestamp: int
     uri: str
@@ -28,6 +27,7 @@ class FacebookExport(BaseModel):
     image: Image
     joinable_mode: JoinableMode
 
+
     def merge(self, other: FacebookExport) -> None:
         if self == other:
             self.messages.extend(other.messages)
@@ -35,9 +35,10 @@ class FacebookExport(BaseModel):
             self.magic_words.update(other.magic_words)
 
     def sort(self) -> None:
-        self.messages.sort(key = lambda m: m.timestamp_ms)
+        self.messages.sort(key = lambda m: m.date_time)
+
 
-    # NOTE Toughen equality conditions ?
     def __eq__(self, other: FacebookExport) -> bool:
+        # NOTE Toughen equality conditions ?
         return self.title == other.title \
             and self.image == other.image

million/model/message.py (+39 -7)

@@ -1,38 +1,46 @@
 from datetime import datetime
-from math import floor
 from typing import Any, List
-from pydantic import BaseModel
+from uuid import uuid4
+from pydantic import BaseModel, Field, PrivateAttr, computed_field, field_validator
+
 
 class Reaction(BaseModel):
     reaction: str
     actor: str
 
+
 class AudioFile(BaseModel):
     uri: str
     creation_timestamp: int
 
+
 class Video(BaseModel):
     uri: str
     creation_timestamp: int
 
+
 class Photo(BaseModel):
     uri: str
     creation_timestamp: int
 
+
 class Gif(BaseModel):
     uri: str
 
+
 class Share(BaseModel):
     link: str
     share_text: str
 
+
 class Sticker(BaseModel):
     uri: str
     ai_stickers: List[Any]
 
+
 class Message(BaseModel):
     sender_name: str
-    timestamp_ms: int
+    date_time: datetime = Field(alias="timestamp_ms")
     content: str | None = None
     sticker: Sticker | None = None
     share: Share | None = None
@@ -45,10 +53,34 @@ class Message(BaseModel):
     is_unsent: bool | None = None
     is_geoblocked_for_viewer: bool
 
+    _id: str = PrivateAttr(default_factory=lambda: str(uuid4()))
+
     def __str__(self) -> str:
-        dt = datetime.fromtimestamp(self.timestamp_ms / 1000)
-        dt_str = dt.strftime("%d/%m/%Y, %H:%M:%S")
-        return f"{self.sender_name}({dt_str}) : {self.content}"
+        dt_str = self.date_time.strftime("%d/%m/%Y, %H:%M:%S")
+
+        msg_str = f"{self.sender_name}({dt_str})"
+
+        if self.content:
+            msg_str += " : " + self.content
+        if self.photos:
+            msg_str += f" [PHOTOS {len(self.photos)}]"
+        if self.videos:
+            msg_str += f" [VIDEOS {len(self.videos)}]"
+        if self.gifs:
+            msg_str += f" [GIFS {len(self.gifs)}]"
+
+        return msg_str
 
     def __hash__(self) -> int:
-        return hash(self.sender_name + str(self.timestamp_ms))
+        return hash(self.item_id)
+
+    @computed_field
+    @property
+    def item_id(self) -> str:
+        return self._id
+
+    @field_validator("date_time", mode="before")
+    def parse_timestamp(cls, v):
+        if isinstance(v, int):
+            return datetime.fromtimestamp(v / 1000)
+        return v
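
Note: with the `timestamp_ms` alias and the `mode="before"` validator, raw export entries keep parsing while exposing a real `datetime` (payload invented; times are local):

```python
from million.model.message import Message

raw = {
    "sender_name": "Figg",
    "timestamp_ms": 1640995200000,  # milliseconds, as in the Facebook export
    "content": "123456",
    "is_geoblocked_for_viewer": False,
}
msg = Message(**raw)
print(msg.date_time)  # 2022-01-01 ..., converted from ms rather than s
```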

million/model/sequence.py (+2 -5)

@@ -1,8 +1,5 @@
 from __future__ import annotations
-
-from pydantic import BaseModel
-import pydantic
-
+from pydantic import validator, BaseModel
 from million.model.message import Message
 import million.analyze.message_evaluation as msg_val
 
@@ -11,7 +8,7 @@ class Sequence(BaseModel):
     start_message: Message
     end_message: Message | None = None
 
-    @pydantic.validator('end_message', pre=True, always=True)
+    @validator('end_message', pre=True, always=True)
     def default_end_message(cls, v, *, values):
         return v or values['start_message']
 

million/parse/fb_exports.py (+1 -0)

@@ -48,6 +48,7 @@ def parse_dirfiles(file_dir: str) -> FacebookExport:
     result.sort()
     return result
 
+
 def __read_broken_fb_json(binary_data):
     # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
     repaired = re.sub(

scripts/find_gromots.py (+5 -11)

@@ -1,14 +1,8 @@
-from datetime import datetime
-from million.analyze.word_finder import findWords
-from million.parse.fb_exports import FacebookExportParser
+from million.analyze.word_finder import filter_words
+import million.parse.fb_exports as fb
 
 
 DATA_PATH = './data/'
-
-parser = FacebookExportParser()
-
-export = parser.parse(DATA_PATH)
-
 gros_mots = [
     '.*merde.*',
     'sexe',
@@ -30,7 +24,8 @@ gros_mots = [
     'pti?n'
     ]
 
-msg_gros_mots = findWords(export.messages, gros_mots)
+export = fb.parse_dirfiles(DATA_PATH)
+msg_gros_mots = filter_words(export.messages, gros_mots)
 
 msg_gros_mots_grp = {}
 
@@ -42,6 +37,5 @@ for name in sorted(msg_gros_mots_grp, key = lambda k: len(msg_gros_mots_grp[k]))
     print(name)
 
     for msg in msg_gros_mots_grp[name]:
-        time = datetime.fromtimestamp(msg.timestamp_ms / 1000)
-        time_str = time.strftime("%d/%m/%Y %H:%M:%S")
+        time_str = msg.date_time.strftime("%d/%m/%Y %H:%M:%S")
         print(f"\t{time_str} : {msg.content}")

scripts/find_holes.py (+11 -11)

@@ -1,22 +1,24 @@
 from datetime import datetime
-from million.analyze.find_holes import compute_sequences, find_holes
-from million.analyze.retain_counts import retain_counts
+import million.analyze.find_holes as fh
+from million.analyze.word_finder import retain_counts
 import million.parse.fb_exports as fb
 
 
-DATA_PATH = './data/'
+DATA_PATH = "./data/"
 
 export = fb.parse_dirfiles(DATA_PATH)
 
 filtered = retain_counts(export.messages)
 
-sequences = compute_sequences(filtered)
+sequences = fh.compute_sequences(filtered)
 
 actual_counted = sum([s.length() for s in sequences])
 
 print(f"Actual counted: {actual_counted}")
 
-holes = find_holes(filtered)
+merged = fh.merge_duplicates(sequences)
+merged = [s for s in merged if s.length() > 1]
+holes = fh.find_holes(filtered)
 
 print(len(holes))
 
@@ -25,13 +27,11 @@ for hole in holes:
 
 
 # lets export a csv file of the holes and the people responsible for them
-with open('output/holes.csv', 'w') as f:
-    f.write('début,fin,taille,responsable1,responsable2,date1,date2\n')
+with open("output/holes.csv", "w") as f:
+    f.write("début,fin,taille,responsable1,responsable2,date1,date2\n")
     for hole in holes:
-        date_start = datetime.utcfromtimestamp(
-            hole.start_message.timestamp_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
-        date_end = datetime.utcfromtimestamp(
-            hole.end_message.timestamp_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
+        date_start = hole.start_message.date_time.strftime("%Y-%m-%d %H:%M:%S")
+        date_end = hole.end_message.date_time.strftime("%Y-%m-%d %H:%M:%S")
         f.write(
             f"{hole.start()},"
             f"{hole.end()},"

scripts/find_missing.py (+25 -0)

@@ -0,0 +1,25 @@
+import million.analyze.message_evaluation as msg_val
+import million.parse.fb_exports as fb
+import time
+
+export = fb.parse_dirfiles("./data")
+messages = export.messages
+
+counts = {val for m in messages if (val := msg_val.get(m)) and val <= 1_000_000}
+counts = sorted(counts)
+
+expected_value = 1
+intervals = []
+
+for value in counts:
+    if value != expected_value:
+        interval_length = value - expected_value
+
+        if interval_length == 1:
+            intervals.append(str(expected_value))
+        else:
+            intervals.append(f"{expected_value}..{value - 1}")
+
+    expected_value = value + 1
+
+print(intervals)
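
Note: the interval rendering traced on a tiny invented value set:

```python
counts = [1, 2, 3, 5, 9]  # stand-in for the deduplicated message values
# value 5 arrives while expecting 4 -> single gap   -> "4"
# value 9 arrives while expecting 6 -> gap of three -> "6..8"
# the script would print: ['4', '6..8']
```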

scripts/read_top.py (+4 -9)

@@ -1,21 +1,16 @@
-from million.view.bar_chart import plot as bar_chart
 from million.analyze.count_participations import count_participations
-from million.analyze.retain_counts import retain_counts
+from million.analyze.word_finder import retain_counts
 import million.parse.fb_exports as fb
 
 
 DATA_PATH = './data/'
 
 export = fb.parse_dirfiles(DATA_PATH)
-
 filtered = retain_counts(export.messages)
 
 print(len(filtered))
 
-counted_participations = count_participations(filtered, export.participants)
-
-kept_participations = [
-    p for p in counted_participations if p['participations'] > 100]
+participations = count_participations(filtered, export.participants, 100)
 
-print("\n".join(
-    [f"{p['name']}: {p['participations']}" for p in kept_participations]))
+for name, count in participations.items():
+    print(f"{name}: {count}")

scripts/test_count_analysis.py (+34 -0)

@@ -0,0 +1,34 @@
+import million.parse.fb_exports as fb
+import million.analyze.message_evaluation as msg_val
+from million.analyze.count_analysis import search_value_at
+
+
+DATA_PATH = "./data/"
+export = fb.parse_dirfiles(DATA_PATH)
+messages = export.messages
+
+expected = 0
+idx = 0
+total_len = len(messages)
+total_as_percent = 100 / total_len
+
+with open('output/analysis_breakdown.txt', 'w', encoding="utf-8") as fichier:
+    while idx < total_len:
+        print(f"\r{round(idx * total_as_percent, 1)}%", end="")
+
+        # skip messages with no detected value
+        if msg_val.get(messages[idx]) is None:
+            idx += 1
+            continue
+
+        expected += 1
+
+        found_index = search_value_at(messages, idx, expected)
+
+        if found_index is not None:
+            fichier.write(f"{expected}\t⇒{messages[found_index]}\n")
+            idx = found_index + 1
+        else:
+            fichier.write(f"{expected}[X]\t⇒{messages[idx]}\n")
+
+print("\nComplete analysis in: output/analysis_breakdown.txt")

test/model/message_test.py (+44 -12)

@@ -1,28 +1,60 @@
-
-
-from million.model.message import Message
 import million.analyze.message_evaluation as msg_val
 from test.TestCase import TestCase
 
 
 class MessageTest(TestCase):
 
-    def test_message_nominal(self, overrides=None, exclude=None):
+    def test_single_digit(self, overrides=None, exclude=None):
         message = self._message_with_text("1")
 
         assert 1 == msg_val.get(message)
 
+    def test_nothing(self, overrides=None, exclude=None):
+        message = self._message_with_text("")
+
+        assert None == msg_val.get(message)
+
+    def test_message_nominal(self, overrides=None, exclude=None):
+        message = self._message_with_text("1234")
+
+        assert 1234 == msg_val.get(message)
+
     def test_message_with_text(self, overrides=None, exclude=None):
-        message = self._message_with_text("1 text")
+        message = self._message_with_text("... 😏😏 269")
 
-        assert 1 == msg_val.get(message)
+        assert 269 == msg_val.get(message)
 
-    def test_message_floored_dot(self, overrides=None, exclude=None):
-        message = self._message_with_text("1.5")
+    def test_message_with_text_2(self, overrides=None, exclude=None):
+        message = self._message_with_text("331 allez la")
 
-        assert 1 == msg_val.get(message)
+        assert 331 == msg_val.get(message)
 
-    def test_message_floored_comma(self, overrides=None, exclude=None):
-        message = self._message_with_text("1,5")
+    def test_message_with_text_3(self, overrides=None, exclude=None):
+        message = self._message_with_text("Ok 2160")
 
-        assert 1 == msg_val.get(message)
+        assert 2160 == msg_val.get(message)
+
+    def test_message_value_cut(self, overrides=None, exclude=None):
+        message = self._message_with_text("66...😏😏😏9")
+
+        assert 669 == msg_val.get(message)
+
+    def test_message_value_cut_2(self, overrides=None, exclude=None):
+        message = self._message_with_text("82heyyyyyy69")
+
+        assert 8269 == msg_val.get(message)
+
+    def test_message_value_cut_3(self, overrides=None, exclude=None):
+        message = self._message_with_text("9339 9339 9339 9339")
+
+        assert 9339 == msg_val.get(message)
+
+    def test_message_in_middle(self, overrides=None, exclude=None):
+        message = self._message_with_text("A peine 5565 ouais...")
+
+        assert 5565 == msg_val.get(message)
+
+    def test_message_float_1(self, overrides=None, exclude=None):
+        message = self._message_with_text("11111,1111111111111111¼")
+
+        assert 11111 == msg_val.get(message)

test/model/typo_test.py (+76 -0)

@@ -0,0 +1,76 @@
+import million.analyze.count_analysis as ca
+from test.TestCase import TestCase
+
+
+class TypoTest(TestCase):
+
+    def test_missing_letter_1(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("4976", "45976") == True
+
+    def test_missing_letter_2(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("4596", "45976") == True
+
+    def test_missing_letter_3(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("5976", "45976") == True
+
+    def test_missing_letter_4(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("4597", "45976") == True
+
+
+    def test_extra_letter_1(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("459766", "45976") == True
+
+    def test_extra_letter_2(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("545976", "45976") == True
+
+    def test_extra_letter_3(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("452976", "45976") == True
+
+    def test_extra_letter_4(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("459776", "45976") == True
+
+    def test_extra_letter_5(self, overrides=None, exclude=None):
+        assert ca.check_extra_or_missing_letter("45976", "45976") == False
+
+
+    def test_single_letter_differ_1(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("35976", "45976") == True
+
+    def test_single_letter_differ_2(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("45986", "45976") == True
+
+    def test_single_letter_differ_3(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("44986", "45976") == False
+
+    def test_single_letter_differ_4(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("35975", "45976") == False
+
+    def test_single_letter_differ_5(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("4976", "45976") == False
+
+    def test_single_letter_differ_6(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("4597", "45976") == False
+
+    def test_single_letter_differ_7(self, overrides=None, exclude=None):
+        assert ca.check_single_letter_differ("45976", "45976") == False
+
+
+    def test_letter_swap_1(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("45976", "45976") == False
+
+    def test_letter_swap_2(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("49576", "45976") == True
+
+    def test_letter_swap_3(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("45967", "45976") == True
+
+    def test_letter_swap_4(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("47956", "45976") == False
+
+    def test_letter_swap_5(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("54966", "45976") == False
+
+    def test_letter_swap_6(self, overrides=None, exclude=None):
+        assert ca.check_letter_swap("54967", "45976") == False