Bladeren bron

algorithme alternatif pour trouver les trous

version fonctionnelle
feature/findHolesV2
Figg 6 maanden geleden
bovenliggende
commit
e9ed7129da
3 gewijzigde bestanden met 89 toevoegingen en 49 verwijderingen
  1. .gitignore (+2, -0)
  2. million/analyze/find_holes.py (+65, -44)
  3. scripts/find_holesV2.py (+22, -5)

+ 2
- 0
.gitignore Bestand weergeven

@@ -10,6 +10,8 @@ _build
10 10
 .cache
11 11
 *.so
12 12
 
13
+output/*
14
+
13 15
 # Installer logs
14 16
 pip-log.txt
15 17
 

+ 65
- 44
million/analyze/find_holes.py Bestand weergeven

@@ -1,5 +1,5 @@
1 1
 
2
-
2
+import time
3 3
 from typing import List
4 4
 from million.model.hole import Hole
5 5
 from million.model.message import Message
@@ -72,52 +72,73 @@ def find_holes(messages: List[Message], accepted_max: int = 1_000_000) -> List[H
72 72
             ))
73 73
     return holes
74 74
 
75
-def find_holesV2(messages: List[Message]) -> List[Hole]:
75
+def _find_value_around_index(messages: List[Message], value, idx, amplitude) -> int:
76
+    check_value = lambda x: messages[x].get_counted_value() == value
77
+    
78
+    if check_value(idx): return idx
79
+
80
+    for offset in range(1, amplitude):
81
+        o_idx = idx + offset * +1
82
+        if check_value(o_idx):
83
+            return o_idx
84
+        
85
+        o_idx = idx + offset * -1
86
+        if check_value(o_idx):
87
+            return o_idx
88
+        
89
+    return -1
90
+
91
+def _open_sequence(sequences: List[Sequence], msg: Message):
92
+    sequence = Sequence(
93
+        start=msg.get_counted_value(),
94
+        start_message=msg,
95
+        end=-1,
96
+        end_message=msg
97
+        )
98
+    
99
+    sequences.append(sequence)
100
+
101
+def _close_sequence(sequences: List[Sequence]):
102
+    if len(sequences) == 0: return
103
+
104
+    sequences[-1].end = sequences[-1].end_message.get_counted_value()
105
+
106
+def _opened_sequence(sequences: List[Sequence]):
107
+    return len(sequences) > 0 and sequences[-1].end == -1
108
+
109
+def find_sequences_v2(messages: List[Message]) -> List[Sequence]:
76 110
     current = 1
77
-    msg_idx = 0
78
-    threshold = 1000
79
-    limitAhead = 100
80
-    limitBehind = 20
111
+    base_idx = 0
112
+    amplitude = 200
81 113
 
82
-    holes = []
114
+    sequences = []
83 115
 
84
-    while msg_idx < len(messages):
85
-        #search value current in messages from msgIdx, with lookahead then lookbehind
116
+    while base_idx < len(messages):
117
+        curr_idx = _find_value_around_index(messages, current, base_idx, amplitude)
118
+        print(f"searching {current} from [{messages[base_idx]}]\t-> {'Not found' if curr_idx == -1 else 'Itself' if curr_idx == base_idx else messages[curr_idx]}")
86 119
         
87
-        for i in range(0, limitAhead):
88
-            msgCurrent = messages[msg_idx + i]
89
-            
90
-            if msgCurrent.get_counted_value() == current: break
120
+        if curr_idx != -1: #trouvé
121
+
122
+            if not _opened_sequence(sequences):
123
+                _open_sequence(sequences, messages[curr_idx])
124
+            else:
125
+                sequences[-1].end_message = messages[curr_idx]
91 126
 
92
-        if msgCurrent.get_counted_value() != current:
93
-            for i in range(1, limitBehind):
94
-                msgCurrent = messages[msg_idx - i]
127
+            base_idx = curr_idx + 1
128
+            current += 1
129
+        else: # pas trouvé
95 130
             
96
-                if msgCurrent.get_counted_value() == current: break
97
-
98
-        if msgCurrent.get_counted_value() == current:
99
-            # la valeur current a été trouvé dans la zone de recherche
100
-            print(f"{msgCurrent.sender_name} : {msgCurrent.content}")
101
-            # si un trou était ouvert il faut le fermer
102
-            if len(holes) > 0 and holes[-1].end == 0:
103
-                holes[-1].end = current-1
104
-                holes[-1].end_message = msgCurrent
105
-                print(f"\t{current-1}")
106
-            msg_idx += 1
107
-        else:
108
-            # la valeur current n'a pas été trouvée
109
-            # on est dans un trou
110
-            # si aucun trou n'est ouvert, on en crée un
111
-            if len(holes) == 0 or holes[-1].end > 0:
112
-                hole = Hole(
113
-                    start=current,
114
-                    end=0,
115
-                    start_message=messages[msg_idx],
116
-                    end_message=Message(sender_name='',timestamp_ms=0)
117
-                    )
118
-                holes.append(hole)
119
-                print(f"\t HOLE : {hole.start}\n\t\t...")
120
-
121
-        current += 1
122
-    
123
-    return holes
131
+            # fermer la sequence si ouverte
132
+            if _opened_sequence(sequences):
133
+                _close_sequence(sequences)
134
+            
135
+            if messages[base_idx].get_counted_value() < current:
136
+                base_idx += 1
137
+            else:
138
+                current += 1
139
+
140
+
141
+        #time.sleep(.005)
142
+
143
+
144
+    return sequences

+ 22
- 5
scripts/find_holesV2.py Bestand weergeven

@@ -1,4 +1,5 @@
1
-from million.analyze.find_holes import find_holesV2
1
+from datetime import datetime
2
+from million.analyze.find_holes import find_sequences_v2
2 3
 from million.analyze.retain_counts import retain_counts
3 4
 from million.parse.fb_exports import FacebookExportParser
4 5
 
@@ -9,8 +10,24 @@ parser = FacebookExportParser()
9 10
 export = parser.parse(DATA_PATH)
10 11
 filtered = retain_counts(export.messages)
11 12
 
12
-holes = find_holesV2(filtered)
13
+sequences = find_sequences_v2(filtered)
13 14
 
14
-for hole in holes:
15
-    print(f"{hole.start} - {hole.end} ({hole.end - hole.start})")
16
-    print(hole.end_message)
15
+with open('output/holes.csv', 'w') as f:
16
+    f.write('début,fin,taille,responsable,date2\n')
17
+    for i in range(1, len(sequences)):
18
+        hole_start = sequences[i-1].end+1
19
+        hole_end = sequences[i].start-1
20
+        hole_end_message = sequences[i].start_message
21
+        hole_start_message = sequences[i-1].start_message
22
+
23
+        date_start = datetime.utcfromtimestamp(
24
+            hole_start_message.timestamp_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
25
+        date_end = datetime.utcfromtimestamp(
26
+            hole_end_message.timestamp_ms / 1000.0).strftime('%Y-%m-%d %H:%M:%S')
27
+        f.write(
28
+            f"{hole_start},"
29
+            f"{hole_end},"
30
+            f"{hole_end - hole_start + 1},"
31
+            f"{hole_end_message.sender_name},"
32
+            f"{date_start},{date_end}\n"
33
+        )

Laden…
Annuleren
Opslaan