14 コミット

作成者 SHA1 メッセージ 日付
  Figg 7f1705fdaf minor fixes, comments 6ヶ月前
  Figg 9ff1887fd2 Renaming DNS into MediaCountMapper 6ヶ月前
  Figg 596bf86884 enrichissement des méthodes d'analyse de comptage 6ヶ月前
  Figg 0741257e96 more message testing 6ヶ月前
  Figg bd353eb3ff final dns file 6ヶ月前
  Figg 8e74ec6a91 Fix count_analysis 6ヶ月前
  Figg 2199dd99cb print message now display media trace 6ヶ月前
  Figg 4a4a13a3dc Initial DNS file 6ヶ月前
  Figg fadff8c69a Content value now extracted differently 6ヶ月前
  Figg c01b29a710 more test messages + test typo checking 6ヶ月前
  Figg 853dba4052 Y avait un warning au build 6ヶ月前
  Figg 1d5ec25932 Merge branch 'DEV-Mael' into feature/find_holes 6ヶ月前
  Figg bf7bf3fee7 déplacer le dns dans data 6ヶ月前
  Figg 391fae71f3 nouveaux scripts pour chercher les valeurs manquantes 6ヶ月前

+ 0
- 1
DNS ファイルの表示

@@ -1 +0,0 @@
1
-https://www.youtube.com/watch?v=mC9yute2k_Q 3000

+ 45
- 0
data/DefaultMediaCountMapFile ファイルの表示

@@ -0,0 +1,45 @@
1
+94965738_575896433034148_2204307637284110336_n_2600568116938585.jpg 300
2
+84316189_161561335269481_4671060857508069376_n_821996424998042.jpg 307
3
+102407032_3165952896761223_6148002473225081360_n_2378777012422623.jpg 308
4
+90507136_213915209958557_5413143962586185728_n_1841388439336535.jpg 309
5
+104035858_782158855522425_8192024435259743235_n_782158852189092.jpg 666
6
+104123574_322212138769518_5707692183879973515_n_322212135436185.jpg 667
7
+104434027_271819697398827_5391503491326101448_n_271819694065494.jpg 1312
8
+95525936_1164142403920772_8318302524985573376_n_299590661169172.jpg 51
9
+52421236_278953813000944_2106293885134176256_n_2905362479561945.gif 1664
10
+20688788_286417625173756_1069917705378725888_n_1207474276261488.gif 666
11
+38234456_1103482206482169_3464478929153163264_n_2628411810759247.gif 1789
12
+104662968_217216135921364_332403069450983046_n_217216132588031.jpg 1914
13
+50706165_602050206907030_9269729130708992_n_3180220838709531.gif 1939
14
+92978153_661083471350851_7802534939089436672_n_585129509051794.jpg 1984
15
+104872794_272192197333677_7875491468143786127_n_272192194000344.jpg 1995
16
+49627753_540217636460822_4914566064169287680_n_2394672374158902.png 2048
17
+https://www.youtube.com/watch?v=mC9yute2k_Q 3000
18
+65681880_655872084893722_5358758350790066176_n_4051854338219896.gif 3666
19
+50165487_1987596788210079_254230440078999552_n_1007765122959718.gif 66
20
+87358105_203182734373455_1323650921388834816_n_3112616325440519.gif 4810
21
+83527853_509829533251553_144101650338938880_n_1220206091644612.gif 6369
22
+84441501_209962830394148_963121690001276928_n_736879910407241.gif 6769
23
+74608941_770967279996317_3169876449326792704_n_1581830875345515.gif 7269
24
+20505423_878434838980511_4604695143109361664_n_299194367865591.gif 666
25
+110264758_573811796634371_8422456995004556652_n_782140962427058.gif 666
26
+65182313_697973310662653_2741056482018590720_n_110058701154052.gif 666
27
+120437981_961865127657562_2352191202134666388_n_297241988832056.gif 17000
28
+133574591_2903570269883547_4546172544540158465_n_1987002451464904.gif 18000
29
+124066484_677676816444243_7811409333876486154_n_382212936779839.gif 20000
30
+130166493_156814772857168_4400190561706308563_n_1434915410205159.gif 21000
31
+131881117_200208011820369_5496884526316665472_n_614869492871247.gif 24000
32
+122477452_404917850669181_7425532495902993743_n_622781605380862.gif 25000
33
+60398112_324025954936328_3959780282919288832_n_1298063493943852.gif 26400
34
+132605238_2250185215114171_4387582615384925988_n_1562306254123019.gif 29000
35
+83715267_525261428388982_9213116445225910272_n_408695157299629.gif 30000
36
+223698887_1438459123189233_486429511094530589_n_2894947440818658.gif 40000
37
+245828092_2845723112338892_4090190909716007091_n_2845723109005559.jpg 36399
38
+246367043_846550806036575_350641140426701499_n_846550802703242.jpg 36400
39
+247417430_407831760973532_6702356361214642186_n_407831757640199.jpg 36401
40
+274103826_545381123173322_5027057711080616063_n_545381116506656.jpg 60909
41
+273830541_343104791045273_1854911206287093351_n_343104784378607.jpg 60910
42
+273907776_234819785533495_1080142729732940044_n_234819782200162.jpg 60911
43
+274242881_999574723977290_4022657018268260987_n_999574720643957.jpg 60912
44
+274008762_640199633904372_3459422682721277586_n_640199623904373.jpg 60913
45
+audioclip16419453900003855_1094480794455461.mp4 57612

+ 132
- 0
million/analyze/count_analysis.py ファイルの表示

@@ -0,0 +1,132 @@
1
+from typing import List
2
+import million.analyze.message_evaluation as msg_val
3
+from million.model.message import Message
4
+
5
+
6
def check_extra_or_missing_letter(word: str, reference: str) -> bool:
    """
    Tell whether ``word`` is ``reference`` with exactly one letter added or removed.

    The two strings must differ in length by exactly one character, and the
    longest one must equal the shortest one once a single character is
    dropped from it.
    """
    if abs(len(word) - len(reference)) != 1:
        return False

    # Lengths are known to differ, so sorting by length is unambiguous.
    shortest, longest = sorted((word, reference), key=len)

    for i, (short_char, long_char) in enumerate(zip(shortest, longest)):
        if short_char != long_char:
            # First mismatch: skipping that character in the longest string
            # must realign both tails.
            return shortest[i:] == longest[i + 1:]

    # No mismatch over the common length: the extra letter is the last one.
    return True
25
+
26
+
27
def check_single_letter_differ(word: str, reference: str) -> bool:
    """
    Tell whether ``word`` differs from ``reference`` by exactly one letter.

    Both strings must have the same length; a missing or extra letter is not
    a substitution and yields ``False``.
    """
    # zip() stops at the shortest string, so without this guard a truncated
    # word with one wrong letter would be reported as a single substitution.
    if len(word) != len(reference):
        return False

    return sum(1 for x, y in zip(reference, word) if x != y) == 1
33
+
34
+
35
def check_letter_swap(word: str, reference: str) -> bool:
    """
    Tell whether ``word`` is ``reference`` with exactly one pair of
    consecutive letters swapped.

    Strings of different lengths, identical strings, and strings with any
    other difference yield ``False``.
    """
    if len(word) != len(reference):
        return False

    for i, (word_char, ref_char) in enumerate(zip(word, reference)):
        if word_char != ref_char:
            # Swap the mismatching letter with the next one and compare the
            # tails. Slicing keeps this safe when the mismatch is on the last
            # letter (nothing to swap with, so the comparison fails).
            swapped_tail = word[i + 1:i + 2] + word_char + word[i + 2:]
            return swapped_tail == reference[i:]

    return False
48
+
49
+
50
def check_typo(word: str, reference: str) -> bool:
    """
    Tell whether ``word`` looks like ``reference`` written with a typo.

    Strings of different lengths are tested for one extra or missing letter;
    strings of equal length are tested for a single substituted letter, then
    for a swap of two consecutive letters.
    """
    if len(word) != len(reference):
        return check_extra_or_missing_letter(word, reference)

    if check_single_letter_differ(word, reference):
        return True

    return check_letter_swap(word, reference)
60
+
61
+
62
def _check_message_concatenation(messages: List[Message], index: int, expected: int) -> bool:
    """
    Tell whether the count ``expected`` was spread over several messages
    starting at ``index``.

    The values of the message at ``index`` and of the following messages sent
    by the same sender are concatenated as text until the result is at least
    as long as ``str(expected)``; both are then compared. Messages from other
    senders are skipped.
    """
    reference = str(expected)
    sender = messages[index].sender_name
    testing = ""

    position = index

    # Stop at the end of the list instead of indexing past it.
    while len(testing) < len(reference) and position < len(messages):
        candidate = messages[position]
        position += 1

        if candidate.sender_name != sender:
            continue

        value = msg_val.get(candidate)
        if value is None:
            # Concatenating the text "None" could never produce a number.
            return False

        testing += str(value)

    return testing == reference
80
+
81
+
82
def _heavy_check(messages: List[Message], index: int, expected: int) -> bool:
    """
    Tell whether the count ``expected`` can be recognised at ``index`` using
    the costlier checks, which are only meant for the non-nominal cases.

    Two checks are tried in order:
    - the count is spread over several messages of the same sender;
    - the value at ``index`` is a typo of ``expected`` and the next message
      carries ``expected + 1``.
    """
    # TODO
    #   - create a method handling several counts contained in the same message body
    #   - create a method for digits represented by a substitute inside the message body,
    #     i.e. a number spelled out (French or Breton), 🍁 for 420, @Elias Cheddar for 69
    if _check_message_concatenation(messages, index, expected):
        return True

    # The typo check needs a following message to confirm the count.
    if index + 1 >= len(messages):
        return False

    word = str(msg_val.get(messages[index]))

    return check_typo(word, str(expected)) and msg_val.get(messages[index + 1]) == expected + 1
97
+
98
+
99
def _check_value_around(messages, index, expected, amplitude_after, amplitude_before):
    """
    Look for a message whose value equals ``expected`` around ``index``.

    Up to ``amplitude_after`` messages after ``index`` are inspected first
    (nearest first), then up to ``amplitude_before`` messages before it
    (nearest first). The message at ``index`` itself is not inspected.

    Returns the index of the first match, or ``None`` when there is none.
    """
    last_after = min(index + amplitude_after, len(messages) - 1)
    for position in range(index + 1, last_after + 1):
        if expected == msg_val.get(messages[position]):
            return position

    # Clamp at 0: a negative index would silently wrap to the end of the list.
    first_before = max(index - amplitude_before, 0)
    for position in range(index - 1, first_before - 1, -1):
        if expected == msg_val.get(messages[position]):
            return position

    return None
108
+
109
+
110
def search_value_at(messages, index, expected, do_heavy_check=True, amplitude_after=1000, amplitude_before=10):
    """
    Find where the count ``expected`` appears in ``messages`` around ``index``.

    The message at ``index`` is tested first, then the surrounding messages
    within ``amplitude_after`` / ``amplitude_before``. When nothing matches
    and ``do_heavy_check`` is set, the heavier analysis is run at ``index``.

    Returns the index of the matching message, or ``None`` if none is found.
    """
    # Nominal case: the current message holds the value.
    if expected == msg_val.get(messages[index]):
        return index

    # Otherwise look at the neighbouring messages.
    nearby_index = _check_value_around(messages, index, expected, amplitude_after, amplitude_before)
    if nearby_index is not None:
        return nearby_index

    # Last resort: heavier checks at the given index.
    heavy_match = do_heavy_check and _heavy_check(messages, index, expected)

    # Nothing matched: report None.
    return index if heavy_match else None

million/analyze/dns_solver.py → million/analyze/media_count_mapper.py ファイルの表示

@@ -1,11 +1,12 @@
1
+from os.path import basename
1 2
 from typing import Dict
2 3
 
3 4
 from pydantic import BaseModel, PrivateAttr
4 5
 from million.model.message import Message
5 6
 
6
-_default_file_path = './DNS'
7
+_default_file_path = 'data/DefaultMediaCountMapFile'
7 8
 
8
-class DNS_solver(BaseModel):
9
+class MediaCountMapper(BaseModel):
9 10
     file_path:str = _default_file_path
10 11
 
11 12
     _bank: Dict[str, int] | None = PrivateAttr(None)
@@ -33,4 +34,5 @@ class DNS_solver(BaseModel):
33 34
         # look into msg attributes
34 35
         # find uri
35 36
         return (msg.share or None) and msg.share.link or \
36
-            (msg.gifs or None) and msg.gifs[0].uri
37
+            (msg.gifs or None) and basename(msg.gifs[0].uri) or \
38
+            (msg.photos or None) and basename(msg.photos[0].uri)

+ 15
- 12
million/analyze/message_evaluation.py ファイルの表示

@@ -1,11 +1,12 @@
1 1
 import re
2 2
 from typing import Dict
3 3
 from million.model.message import Message
4
-import million.analyze.dns_solver as dns
4
+import million.analyze.media_count_mapper as mcm
5 5
 
6 6
 
7 7
 _memoization: Dict[Message, int] = {}
8
-_dns_solver: dns.DNS_solver = dns.DNS_solver()
8
+_dns_solver: mcm.MediaCountMapper = mcm.MediaCountMapper()
9
+
9 10
 
10 11
 def get(msg: Message) -> int:
11 12
     """
@@ -13,6 +14,7 @@ def get(msg: Message) -> int:
13 14
     """
14 15
     return _memoization.get(msg, _compute(msg))
15 16
 
17
+
16 18
 def reset(msg: Message) -> None:
17 19
     """
18 20
     Drop memorized value of this Message
@@ -20,6 +22,7 @@ def reset(msg: Message) -> None:
20 22
     if msg in _memoization:
21 23
         _memoization.pop(msg)
22 24
 
25
+
23 26
 def reset() -> None:
24 27
     """
25 28
     Drop every memorized message value
@@ -28,20 +31,20 @@ def reset() -> None:
28 31
 
29 32
 
30 33
 def _compute(msg: Message) -> int:
31
-    value = _dns_solver.solve(msg) or \
32
-        _computeContent(msg) or \
33
-        None
34
+    value = _dns_solver.solve(msg) or _computeContent(msg) or None
34 35
 
35 36
     _memoization[msg] = value
36 37
     return value
37 38
 
39
+
38 40
 def _computeContent(msg: Message) -> int:
39
-    # TODO parse potential math expressions in content
40
-    match = msg.content and re.search(r"\d+", msg.content)
41
+    if not msg.content:
42
+        return
41 43
     
44
+    s = re.sub(r'[^\s\d.,]|[.,]{2,}',"", msg.content)
45
+    match = re.search(r"\d+", s)
46
+
42 47
     if match:
43
-        value = int(match.group())
44
-    else:
45
-        value = None
46
-    
47
-    return value
48
+        return int(match.group())
49
+
50
+    return None

+ 15
- 3
million/model/message.py ファイルの表示

@@ -1,7 +1,7 @@
1 1
 from datetime import datetime
2 2
 from typing import Any, List
3 3
 from uuid import uuid4
4
-from pydantic import BaseModel, Field, PrivateAttr, computed_field, validator
4
+from pydantic import BaseModel, Field, PrivateAttr, computed_field, field_validator
5 5
 
6 6
 
7 7
 class Reaction(BaseModel):
@@ -57,7 +57,19 @@ class Message(BaseModel):
57 57
 
58 58
     def __str__(self) -> str:
59 59
         dt_str = self.date_time.strftime("%d/%m/%Y, %H:%M:%S")
60
-        return f"{self.sender_name}({dt_str}) : {self.content}"
60
+
61
+        msg_str = f"{self.sender_name}({dt_str})"
62
+
63
+        if self.content:
64
+            msg_str += " : " + self.content
65
+        if self.photos:
66
+            msg_str += f" [PHOTOS {len(self.photos)}]"
67
+        if self.videos:
68
+            msg_str += f" [VIDEOS {len(self.videos)}]"
69
+        if self.gifs:
70
+            msg_str += f" [GIFS {len(self.gifs)}]"
71
+
72
+        return msg_str
61 73
 
62 74
     def __hash__(self) -> int:
63 75
         return hash(self.item_id)
@@ -67,7 +79,7 @@ class Message(BaseModel):
67 79
     def item_id(self) -> str:
68 80
         return self._id
69 81
 
70
-    @validator("date_time", pre=True, always=True)
82
+    @field_validator("date_time")
71 83
     def parse_timestamp(cls, v):
72 84
         if isinstance(v, int):
73 85
             return datetime.fromtimestamp(v / 1000)

+ 25
- 0
scripts/find_missing.py ファイルの表示

@@ -0,0 +1,25 @@
1
+import million.analyze.message_evaluation as msg_val
2
+import million.parse.fb_exports as fb
3
+import time
4
+
5
# Load every message of the export found under ./data.
export = fb.parse_dirfiles("./data")
messages = export.messages

# Distinct detected counts up to one million, in ascending order.
counts = sorted({val for m in messages if (val := msg_val.get(m)) and val <= 1_000_000})

expected_value = 1
intervals = []

# Walk the sorted counts and record every gap as "n" or "first..last".
for value in counts:
    gap = value - expected_value

    if gap == 1:
        intervals.append(str(expected_value))
    elif gap != 0:
        intervals.append(f"{expected_value}..{value - 1}")

    expected_value = value + 1

print(intervals)

+ 34
- 0
scripts/test_count_analysis.py ファイルの表示

@@ -0,0 +1,34 @@
1
+import million.parse.fb_exports as fb
2
+import million.analyze.message_evaluation as msg_val
3
+from million.analyze.count_analysis import  search_value_at
4
+
5
+
6
DATA_PATH = "./data/"
export = fb.parse_dirfiles(DATA_PATH)
messages = export.messages

expected = 0
idx = 0
total_len = len(messages)
# Guard against an empty export so the progress ratio cannot divide by zero.
total_as_percent = 100 / total_len if total_len else 0

with open('output/analysis_breakdown.txt', 'w', encoding="utf-8") as fichier:
    while idx < total_len:
        print(f"\r{round(idx * total_as_percent, 1)}%", end="")

        # skip messages with no detected value
        if msg_val.get(messages[idx]) is None:
            idx += 1
            continue

        expected += 1

        found_index = search_value_at(messages, idx, expected)

        # Index 0 is a valid match, so compare against None explicitly.
        if found_index is not None:
            fichier.write(f"{expected}\t⇒{messages[found_index]}\n")
            idx = found_index + 1
        else:
            # NOTE(review): idx is not advanced here, so the same message is
            # retried with the next expected value - confirm this cannot loop
            # forever when no later message carries a higher count.
            fichier.write(f"{expected}[X]\t⇒{messages[idx]}\n")

print("\nComplete analysis in: output/analysis_breakdown.txt")

+ 44
- 12
test/model/message_test.py ファイルの表示

@@ -1,28 +1,60 @@
1
-
2
-
3
-from million.model.message import Message
4 1
 import million.analyze.message_evaluation as msg_val
5 2
 from test.TestCase import TestCase
6 3
 
7 4
 
8 5
 class MessageTest(TestCase):
9 6
 
10
-    def test_message_nominal(self, overrides=None, exclude=None):
7
+    def test_single_digit(self, overrides=None, exclude=None):
11 8
         message = self._message_with_text("1")
12 9
 
13 10
         assert 1 == msg_val.get(message)
14 11
 
12
+    def test_nothing(self, overrides=None, exclude=None):
13
+        message = self._message_with_text("")
14
+
15
+        assert None == msg_val.get(message)
16
+
17
+    def test_message_nominal(self, overrides=None, exclude=None):
18
+        message = self._message_with_text("1234")
19
+
20
+        assert 1234 == msg_val.get(message)
21
+
15 22
     def test_message_with_text(self, overrides=None, exclude=None):
16
-        message = self._message_with_text("1 text")
23
+        message = self._message_with_text("... 😏😏 269")
17 24
 
18
-        assert 1 == msg_val.get(message)
25
+        assert 269 == msg_val.get(message)
19 26
 
20
-    def test_message_floored_dot(self, overrides=None, exclude=None):
21
-        message = self._message_with_text("1.5")
27
+    def test_message_with_text_2(self, overrides=None, exclude=None):
28
+        message = self._message_with_text("331 allez la")
22 29
 
23
-        assert 1 == msg_val.get(message)
30
+        assert 331 == msg_val.get(message)
24 31
 
25
-    def test_message_floored_comma(self, overrides=None, exclude=None):
26
-        message = self._message_with_text("1,5")
32
+    def test_message_with_text_3(self, overrides=None, exclude=None):
33
+        message = self._message_with_text("Ok 2160")
27 34
 
28
-        assert 1 == msg_val.get(message)
35
+        assert 2160 == msg_val.get(message)
36
+
37
+    def test_message_value_cut(self, overrides=None, exclude=None):
38
+        message = self._message_with_text("66...😏😏😏9")
39
+
40
+        assert 669 == msg_val.get(message)
41
+
42
+    def test_message_value_cut_2(self, overrides=None, exclude=None):
43
+        message = self._message_with_text("82heyyyyyy69")
44
+
45
+        assert 8269 == msg_val.get(message)
46
+
47
+    def test_message_value_cut_2(self, overrides=None, exclude=None):
48
+        message = self._message_with_text("9339 9339 9339 9339")
49
+
50
+        assert 9339 == msg_val.get(message)
51
+
52
+    def test_message_in_middle(self, overrides=None, exclude=None):
53
+        message = self._message_with_text("A peine 5565 ouais...")
54
+
55
+        assert 5565 == msg_val.get(message)
56
+
57
+    def test_message_float_1(self, overrides=None, exclude=None):
58
+        message = self._message_with_text("11111,1111111111111111¼")
59
+
60
+        assert 11111 == msg_val.get(message)

+ 76
- 0
test/model/typo_test.py ファイルの表示

@@ -0,0 +1,76 @@
1
+import million.analyze.count_analysis as ca
2
+from test.TestCase import TestCase
3
+
4
+
5
class TypoTest(TestCase):
    """Checks of the typo-detection helpers of ``count_analysis``."""

    # --- one letter missing from the word ---

    def test_missing_letter_1(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4976", "45976") is True

    def test_missing_letter_2(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4596", "45976") is True

    def test_missing_letter_3(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("5976", "45976") is True

    def test_missing_letter_4(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("4597", "45976") is True

    # --- one extra letter in the word ---

    def test_extra_letter_1(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("459766", "45976") is True

    def test_extra_letter_2(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("545976", "45976") is True

    def test_extra_letter_3(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("452976", "45976") is True

    def test_extra_letter_4(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("459776", "45976") is True

    def test_extra_letter_5(self, overrides=None, exclude=None):
        assert ca.check_extra_or_missing_letter("45976", "45976") is False

    # --- one substituted letter ---

    def test_single_letter_differ_1(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("35976", "45976") is True

    def test_single_letter_differ_2(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("45986", "45976") is True

    def test_single_letter_differ_3(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("44986", "45976") is False

    def test_single_letter_differ_4(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("35975", "45976") is False

    def test_single_letter_differ_5(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("4976", "45976") is False

    def test_single_letter_differ_6(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("4597", "45976") is False

    def test_single_letter_differ_7(self, overrides=None, exclude=None):
        assert ca.check_single_letter_differ("45976", "45976") is False

    # --- two consecutive letters swapped ---

    def test_letter_swap_1(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("45976", "45976") is False

    def test_letter_swap_2(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("49576", "45976") is True

    def test_letter_swap_3(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("45967", "45976") is True

    def test_letter_swap_4(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("47956", "45976") is False

    def test_letter_swap_5(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("54966", "45976") is False

    def test_letter_swap_6(self, overrides=None, exclude=None):
        assert ca.check_letter_swap("54967", "45976") is False

読み込み中…
キャンセル
保存