14 коммитов

Автор SHA1 Сообщение Дата
  Figg 7f1705fdaf minor fixes, comments 9 месяцев назад
  Figg 9ff1887fd2 Renaming DNS into MediaCountMapper 9 месяцев назад
  Figg 596bf86884 enrichissement des méthodes d'analyse de comptage 9 месяцев назад
  Figg 0741257e96 more message testing 9 месяцев назад
  Figg bd353eb3ff final dns file 9 месяцев назад
  Figg 8e74ec6a91 Fix count_analysis 9 месяцев назад
  Figg 2199dd99cb print message now display media trace 9 месяцев назад
  Figg 4a4a13a3dc Initial DNS file 9 месяцев назад
  Figg fadff8c69a Content value now extracted differently 9 месяцев назад
  Figg c01b29a710 more test messages + test typo checking 9 месяцев назад
  Figg 853dba4052 Y avait un warning au build 9 месяцев назад
  Figg 1d5ec25932 Merge branch 'DEV-Mael' into feature/find_holes 9 месяцев назад
  Figg bf7bf3fee7 déplacer le dns dans data 9 месяцев назад
  Figg 391fae71f3 nouveaux scripts pour chercher les valeurs manquantes 9 месяцев назад

+ 0
- 1
DNS Просмотреть файл

@@ -1 +0,0 @@
1
-https://www.youtube.com/watch?v=mC9yute2k_Q 3000

+ 45
- 0
data/DefaultMediaCountMapFile Просмотреть файл

@@ -0,0 +1,45 @@
1
+94965738_575896433034148_2204307637284110336_n_2600568116938585.jpg 300
2
+84316189_161561335269481_4671060857508069376_n_821996424998042.jpg 307
3
+102407032_3165952896761223_6148002473225081360_n_2378777012422623.jpg 308
4
+90507136_213915209958557_5413143962586185728_n_1841388439336535.jpg 309
5
+104035858_782158855522425_8192024435259743235_n_782158852189092.jpg 666
6
+104123574_322212138769518_5707692183879973515_n_322212135436185.jpg 667
7
+104434027_271819697398827_5391503491326101448_n_271819694065494.jpg 1312
8
+95525936_1164142403920772_8318302524985573376_n_299590661169172.jpg 51
9
+52421236_278953813000944_2106293885134176256_n_2905362479561945.gif 1664
10
+20688788_286417625173756_1069917705378725888_n_1207474276261488.gif 666
11
+38234456_1103482206482169_3464478929153163264_n_2628411810759247.gif 1789
12
+104662968_217216135921364_332403069450983046_n_217216132588031.jpg 1914
13
+50706165_602050206907030_9269729130708992_n_3180220838709531.gif 1939
14
+92978153_661083471350851_7802534939089436672_n_585129509051794.jpg 1984
15
+104872794_272192197333677_7875491468143786127_n_272192194000344.jpg 1995
16
+49627753_540217636460822_4914566064169287680_n_2394672374158902.png 2048
17
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000
18
+65681880_655872084893722_5358758350790066176_n_4051854338219896.gif 3666
19
+50165487_1987596788210079_254230440078999552_n_1007765122959718.gif 66
20
+87358105_203182734373455_1323650921388834816_n_3112616325440519.gif 4810
21
+83527853_509829533251553_144101650338938880_n_1220206091644612.gif 6369
22
+84441501_209962830394148_963121690001276928_n_736879910407241.gif 6769
23
+74608941_770967279996317_3169876449326792704_n_1581830875345515.gif 7269
24
+20505423_878434838980511_4604695143109361664_n_299194367865591.gif 666
25
+110264758_573811796634371_8422456995004556652_n_782140962427058.gif 666
26
+65182313_697973310662653_2741056482018590720_n_110058701154052.gif 666
27
+120437981_961865127657562_2352191202134666388_n_297241988832056.gif 17000
28
+133574591_2903570269883547_4546172544540158465_n_1987002451464904.gif 18000
29
+124066484_677676816444243_7811409333876486154_n_382212936779839.gif 20000
30
+130166493_156814772857168_4400190561706308563_n_1434915410205159.gif 21000
31
+131881117_200208011820369_5496884526316665472_n_614869492871247.gif 24000
32
+122477452_404917850669181_7425532495902993743_n_622781605380862.gif 25000
33
+60398112_324025954936328_3959780282919288832_n_1298063493943852.gif 26400
34
+132605238_2250185215114171_4387582615384925988_n_1562306254123019.gif 29000
35
+83715267_525261428388982_9213116445225910272_n_408695157299629.gif 30000
36
+223698887_1438459123189233_486429511094530589_n_2894947440818658.gif 40000
37
+245828092_2845723112338892_4090190909716007091_n_2845723109005559.jpg 36399
38
+246367043_846550806036575_350641140426701499_n_846550802703242.jpg 36400
39
+247417430_407831760973532_6702356361214642186_n_407831757640199.jpg 36401
40
+274103826_545381123173322_5027057711080616063_n_545381116506656.jpg 60909
41
+273830541_343104791045273_1854911206287093351_n_343104784378607.jpg 60910
42
+273907776_234819785533495_1080142729732940044_n_234819782200162.jpg 60911
43
+274242881_999574723977290_4022657018268260987_n_999574720643957.jpg 60912
44
+274008762_640199633904372_3459422682721277586_n_640199623904373.jpg 60913
45
+audioclip16419453900003855_1094480794455461.mp4 57612

+ 132
- 0
million/analyze/count_analysis.py Просмотреть файл

@@ -0,0 +1,132 @@
1
+from typing import List
2
+import million.analyze.message_evaluation as msg_val
3
+from million.model.message import Message
4
+
5
+
6
def check_extra_or_missing_letter(word: str, reference: str) -> bool:
    """
    Tell whether word has exactly one extra or one missing letter
    compared to reference.

    Returns False as soon as the two lengths do not differ by exactly one.
    Otherwise returns True when removing a single character from the longer
    string yields the shorter one.
    """
    if abs(len(word) - len(reference)) != 1:
        return False

    # Lengths differ by exactly one, so ordering by length is unambiguous.
    shortest, longest = sorted((word, reference), key=len)

    for i, (short_char, long_char) in enumerate(zip(shortest, longest)):
        if short_char != long_char:
            # The extra letter sits at position i of the longest string:
            # skipping it must realign both tails.
            return shortest[i:] == longest[i + 1:]

    # No mismatch over the common prefix: the extra letter is the last one.
    return True
25
+
26
+
27
def check_single_letter_differ(word: str, reference: str) -> bool:
    """
    Tell whether word differs from reference by exactly one letter,
    position by position.

    Strings of different lengths never match: zip() would otherwise stop at
    the shorter one and silently ignore the trailing characters.
    """
    if len(word) != len(reference):
        return False

    return sum(x != y for x, y in zip(reference, word)) == 1
33
+
34
+
35
def check_letter_swap(word: str, reference: str) -> bool:
    """
    Tell whether word equals reference except for one swap of two
    consecutive letters.

    Returns False when the lengths differ or when no mismatch is found
    before the last character.
    """
    if len(word) != len(reference):
        return False

    # The last character is left out: a swap needs a following letter.
    for i, (word_char, ref_char) in enumerate(zip(word[:-1], reference[:-1])):
        if word_char != ref_char:
            # Undo the suspected swap at the first mismatch and compare
            # the remaining tails.
            unswapped_tail = word[i + 1] + word[i] + word[i + 2:]
            return unswapped_tail == reference[i:]

    return False
48
+
49
+
50
def check_typo(word: str, reference: str) -> bool:
    """
    Tell whether word looks like a typo of reference.

    Different lengths are delegated to the extra/missing letter check;
    equal lengths are accepted on a single differing letter or on a swap
    of two consecutive letters.
    """
    if len(reference) != len(word):
        return check_extra_or_missing_letter(word, reference)

    if check_single_letter_differ(word, reference):
        return True

    return check_letter_swap(word, reference)
60
+
61
+
62
def _check_message_concatenation(messages: List[Message], index: int, expected: int) -> bool:
    """
    Tell whether the count expected was spread over several messages.

    Starting at index, the values of the messages sent by the same sender
    as messages[index] are concatenated as strings until the result is at
    least as long as str(expected); messages from other senders are skipped.

    Returns False when the end of the list is reached first, or when one of
    the sender's messages has no value (it cannot be part of a number).
    """
    reference = str(expected)
    sender = messages[index].sender_name
    testing = ""

    position = index

    # Bounded by the list length: the scan must not run past the last message.
    while len(testing) < len(reference) and position < len(messages):
        candidate = messages[position]
        position += 1

        if candidate.sender_name != sender:
            continue

        value = msg_val.get(candidate)
        if value is None:
            # Concatenating the text "None" could never match a number.
            return False

        testing += str(value)

    return testing == reference
80
+
81
+
82
def _heavy_check(messages: List[Message], index: int, expected: int) -> bool:
    """
    Tell whether the count expected can be recognised at index using the
    costlier heuristics, meant for when the nominal lookup has failed.

    Two heuristics are tried in order:
      1. the count is spread over several messages of the same sender;
      2. the value at index is a typo of expected AND the following message
         holds expected + 1.
    """
    # TODO
    #   - add a method for the case where several counts are contained in the same message body
    #   - add a method for the case where digits are represented by a substitute in the message body
    #     i.e. a number spelled out (French or Breton), 🍁 for 420, @Elias Cheddar for 69
    if _check_message_concatenation(messages, index, expected):
        return True

    word = str(msg_val.get(messages[index]))
    if not check_typo(word, str(expected)):
        return False

    # A typo is only trusted when the next message confirms the sequence;
    # the last message of the list has no successor to confirm it.
    next_index = index + 1
    return next_index < len(messages) and msg_val.get(messages[next_index]) == expected + 1
97
+
98
+
99
def _check_value_around(messages, index, expected, amplitude_after, amplitude_before):
    """
    Look for a message whose value equals expected in the neighbourhood of index.

    The amplitude_after messages following index are scanned first, nearest
    first, then the amplitude_before messages preceding it, nearest first.
    Both scans are clamped to the bounds of the list.

    Returns the index of the first match, or None when there is none.
    """
    last_after = min(index + amplitude_after, len(messages) - 1)
    for position in range(index + 1, last_after + 1):
        if msg_val.get(messages[position]) == expected:
            return position

    # Clamp at 0: a negative index would silently read from the END of the list.
    first_before = max(index - amplitude_before, 0)
    for position in range(index - 1, first_before - 1, -1):
        if msg_val.get(messages[position]) == expected:
            return position

    return None
108
+
109
+
110
def search_value_at(messages, index, expected, do_heavy_check=True, amplitude_after=1000, amplitude_before=10):
    """
    Locate the count expected in messages, starting from index.

    The lookup goes through three stages and stops at the first success:
      1. the message at index itself;
      2. the surrounding messages, within amplitude_after messages after
         and amplitude_before messages before index;
      3. when do_heavy_check is set, the heavier heuristics applied at index.

    Returns the index where the count was found, or None.
    """
    # Stage 1: the current message already holds the value
    if msg_val.get(messages[index]) == expected:
        return index

    # Stage 2: look around the current message
    nearby_index = _check_value_around(messages, index, expected, amplitude_after, amplitude_before)
    if nearby_index is not None:
        return nearby_index

    # Stage 3: nothing at index nor within the amplitude, fall back to the heavy check
    heavy_match = do_heavy_check and _heavy_check(messages, index, expected)

    # When every stage failed, None is returned
    return index if heavy_match else None

million/analyze/dns_solver.py → million/analyze/media_count_mapper.py Просмотреть файл

@@ -1,11 +1,12 @@
1
+from os.path import basename
1 2
 from typing import Dict
2 3
 
3 4
 from pydantic import BaseModel, PrivateAttr
4 5
 from million.model.message import Message
5 6
 
6
-_default_file_path = './DNS'
7
+_default_file_path = 'data/DefaultMediaCountMapFile'
7 8
 
8
-class DNS_solver(BaseModel):
9
+class MediaCountMapper(BaseModel):
9 10
     file_path:str = _default_file_path
10 11
 
11 12
     _bank: Dict[str, int] | None = PrivateAttr(None)
@@ -33,4 +34,5 @@ class DNS_solver(BaseModel):
33 34
         # look into msg attributes
34 35
         # find uri
35 36
         return (msg.share or None) and msg.share.link or \
36
-            (msg.gifs or None) and msg.gifs[0].uri
37
+            (msg.gifs or None) and basename(msg.gifs[0].uri) or \
38
+            (msg.photos or None) and basename(msg.photos[0].uri)

+ 15
- 12
million/analyze/message_evaluation.py Просмотреть файл

@@ -1,11 +1,12 @@
1 1
 import re
2 2
 from typing import Dict
3 3
 from million.model.message import Message
4
-import million.analyze.dns_solver as dns
4
+import million.analyze.media_count_mapper as mcm
5 5
 
6 6
 
7 7
 _memoization: Dict[Message, int] = {}
8
-_dns_solver: dns.DNS_solver = dns.DNS_solver()
8
+_dns_solver: mcm.MediaCountMapper = mcm.MediaCountMapper()
9
+
9 10
 
10 11
 def get(msg: Message) -> int:
11 12
     """
@@ -13,6 +14,7 @@ def get(msg: Message) -> int:
13 14
     """
14 15
     return _memoization.get(msg, _compute(msg))
15 16
 
17
+
16 18
 def reset(msg: Message) -> None:
17 19
     """
18 20
     Drop memorized value of this Message
@@ -20,6 +22,7 @@ def reset(msg: Message) -> None:
20 22
     if msg in _memoization:
21 23
         _memoization.pop(msg)
22 24
 
25
+
23 26
 def reset() -> None:
24 27
     """
25 28
     Drop every memorized message value
@@ -28,20 +31,20 @@ def reset() -> None:
28 31
 
29 32
 
30 33
 def _compute(msg: Message) -> int:
31
-    value = _dns_solver.solve(msg) or \
32
-        _computeContent(msg) or \
33
-        None
34
+    value = _dns_solver.solve(msg) or _computeContent(msg) or None
34 35
 
35 36
     _memoization[msg] = value
36 37
     return value
37 38
 
39
+
38 40
 def _computeContent(msg: Message) -> int:
39
-    # TODO parse potential math expressions in content
40
-    match = msg.content and re.search(r"\d+", msg.content)
41
+    if not msg.content:
42
+        return
41 43
     
44
+    s = re.sub(r'[^\s\d.,]|[.,]{2,}',"", msg.content)
45
+    match = re.search(r"\d+", s)
46
+
42 47
     if match:
43
-        value = int(match.group())
44
-    else:
45
-        value = None
46
-    
47
-    return value
48
+        return int(match.group())
49
+
50
+    return None

+ 15
- 3
million/model/message.py Просмотреть файл

@@ -1,7 +1,7 @@
1 1
 from datetime import datetime
2 2
 from typing import Any, List
3 3
 from uuid import uuid4
4
-from pydantic import BaseModel, Field, PrivateAttr, computed_field, validator
4
+from pydantic import BaseModel, Field, PrivateAttr, computed_field, field_validator
5 5
 
6 6
 
7 7
 class Reaction(BaseModel):
@@ -57,7 +57,19 @@ class Message(BaseModel):
57 57
 
58 58
     def __str__(self) -> str:
59 59
         dt_str = self.date_time.strftime("%d/%m/%Y, %H:%M:%S")
60
-        return f"{self.sender_name}({dt_str}) : {self.content}"
60
+
61
+        msg_str = f"{self.sender_name}({dt_str})"
62
+
63
+        if self.content:
64
+            msg_str += " : " + self.content
65
+        if self.photos:
66
+            msg_str += f" [PHOTOS {len(self.photos)}]"
67
+        if self.videos:
68
+            msg_str += f" [VIDEOS {len(self.videos)}]"
69
+        if self.gifs:
70
+            msg_str += f" [GIFS {len(self.gifs)}]"
71
+
72
+        return msg_str
61 73
 
62 74
     def __hash__(self) -> int:
63 75
         return hash(self.item_id)
@@ -67,7 +79,7 @@ class Message(BaseModel):
67 79
     def item_id(self) -> str:
68 80
         return self._id
69 81
 
70
-    @validator("date_time", pre=True, always=True)
82
+    @field_validator("date_time")
71 83
     def parse_timestamp(cls, v):
72 84
         if isinstance(v, int):
73 85
             return datetime.fromtimestamp(v / 1000)

+ 25
- 0
scripts/find_missing.py Просмотреть файл

@@ -0,0 +1,25 @@
1
+import million.analyze.message_evaluation as msg_val
2
+import million.parse.fb_exports as fb
3
+import time
4
+
5
export = fb.parse_dirfiles("./data")
messages = export.messages

# Distinct message values, in ascending order; falsy values (None, 0) and
# anything above one million are left out.
counts = sorted(
    {val for m in messages if (val := msg_val.get(m)) and val <= 1_000_000}
)

expected_value = 1
intervals = []

for value in counts:
    if value != expected_value:
        # Every count from expected_value up to value - 1 is missing.
        last_missing = value - 1
        intervals.append(
            str(expected_value)
            if last_missing == expected_value
            else f"{expected_value}..{last_missing}"
        )

    expected_value = value + 1

print(intervals)

+ 34
- 0
scripts/test_count_analysis.py Просмотреть файл

@@ -0,0 +1,34 @@
1
+import million.parse.fb_exports as fb
2
+import million.analyze.message_evaluation as msg_val
3
+from million.analyze.count_analysis import  search_value_at
4
+
5
+
6
DATA_PATH = "./data/"
export = fb.parse_dirfiles(DATA_PATH)
messages = export.messages

expected = 0
idx = 0
total_len = len(messages)
# Guard against an empty export, which would otherwise divide by zero.
total_as_percent = 100 / total_len if total_len else 0

with open('output/analysis_breakdown.txt', 'w', encoding="utf-8") as fichier:
    while idx < total_len:
        # Progress indicator, rewritten in place on the same console line
        print(f"\r{round(idx * total_as_percent, 1)}%", end="")

        # skip messages with no detected value
        if msg_val.get(messages[idx]) is None:
            idx += 1
            continue

        expected += 1

        found_index = search_value_at(messages, idx, expected)

        # Compare against None explicitly: index 0 is a valid match but is falsy.
        if found_index is not None:
            fichier.write(f"{expected}\t⇒{messages[found_index]}\n")
            idx = found_index + 1
        else:
            # Count not found: flag it and retry the next count at the same message
            fichier.write(f"{expected}[X]\t⇒{messages[idx]}\n")

print("\nComplete analysis in: output/analysis_breakdown.txt")

+ 44
- 12
test/model/message_test.py Просмотреть файл

@@ -1,28 +1,60 @@
1
-
2
-
3
-from million.model.message import Message
4 1
 import million.analyze.message_evaluation as msg_val
5 2
 from test.TestCase import TestCase
6 3
 
7 4
 
class MessageTest(TestCase):
    """Checks the value extracted from the text content of a message."""

    def test_single_digit(self, overrides=None, exclude=None):
        message = self._message_with_text("1")

        assert 1 == msg_val.get(message)

    def test_nothing(self, overrides=None, exclude=None):
        message = self._message_with_text("")

        assert msg_val.get(message) is None

    def test_message_nominal(self, overrides=None, exclude=None):
        message = self._message_with_text("1234")

        assert 1234 == msg_val.get(message)

    def test_message_with_text(self, overrides=None, exclude=None):
        message = self._message_with_text("... 😏😏 269")

        assert 269 == msg_val.get(message)

    def test_message_with_text_2(self, overrides=None, exclude=None):
        message = self._message_with_text("331 allez la")

        assert 331 == msg_val.get(message)

    def test_message_with_text_3(self, overrides=None, exclude=None):
        message = self._message_with_text("Ok 2160")

        assert 2160 == msg_val.get(message)

    def test_message_value_cut(self, overrides=None, exclude=None):
        message = self._message_with_text("66...😏😏😏9")

        assert 669 == msg_val.get(message)

    def test_message_value_cut_2(self, overrides=None, exclude=None):
        message = self._message_with_text("82heyyyyyy69")

        assert 8269 == msg_val.get(message)

    # Previously also named test_message_value_cut_2, which shadowed the
    # test above so that it never ran.
    def test_message_value_repeated(self, overrides=None, exclude=None):
        message = self._message_with_text("9339 9339 9339 9339")

        assert 9339 == msg_val.get(message)

    def test_message_in_middle(self, overrides=None, exclude=None):
        message = self._message_with_text("A peine 5565 ouais...")

        assert 5565 == msg_val.get(message)

    def test_message_float_1(self, overrides=None, exclude=None):
        message = self._message_with_text("11111,1111111111111111¼")

        assert 11111 == msg_val.get(message)

+ 76
- 0
test/model/typo_test.py Просмотреть файл

@@ -0,0 +1,76 @@
1
+import million.analyze.count_analysis as ca
2
+from test.TestCase import TestCase
3
+
4
+
5
class TypoTest(TestCase):
    """Checks the typo detection helpers of count_analysis against the reference "45976"."""

    # --- one letter missing from the word ---

    def test_missing_letter_1(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4976", "45976")

    def test_missing_letter_2(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4596", "45976")

    def test_missing_letter_3(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("5976", "45976")

    def test_missing_letter_4(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4597", "45976")

    # --- one extra letter in the word ---

    def test_extra_letter_1(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("459766", "45976")

    def test_extra_letter_2(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("545976", "45976")

    def test_extra_letter_3(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("452976", "45976")

    def test_extra_letter_4(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("459776", "45976")

    def test_extra_letter_5(self, overrides=None, exclude=None):
        assert not ca.check_extra_or_missing_letter("45976", "45976")

    # --- exactly one differing letter ---

    def test_single_letter_differ_1(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("35976", "45976")

    def test_single_letter_differ_2(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("45986", "45976")

    def test_single_letter_differ_3(self, overrides=None, exclude=None):
        assert not ca.check_single_letter_differ("44986", "45976")

    def test_single_letter_differ_4(self, overrides=None, exclude=None):
        assert not ca.check_single_letter_differ("35975", "45976")

    def test_single_letter_differ_5(self, overrides=None, exclude=None):
        assert not ca.check_single_letter_differ("4976", "45976")

    def test_single_letter_differ_6(self, overrides=None, exclude=None):
        assert not ca.check_single_letter_differ("4597", "45976")

    def test_single_letter_differ_7(self, overrides=None, exclude=None):
        assert not ca.check_single_letter_differ("45976", "45976")

    # --- swap of two consecutive letters ---

    def test_letter_swap_1(self, overrides=None, exclude=None):
        assert not ca.check_letter_swap("45976", "45976")

    def test_letter_swap_2(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("49576", "45976")

    def test_letter_swap_3(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("45967", "45976")

    def test_letter_swap_4(self, overrides=None, exclude=None):
        assert not ca.check_letter_swap("47956", "45976")

    def test_letter_swap_5(self, overrides=None, exclude=None):
        assert not ca.check_letter_swap("54966", "45976")

    def test_letter_swap_6(self, overrides=None, exclude=None):
        assert not ca.check_letter_swap("54967", "45976")

Загрузка…
Отмена
Сохранить