# export_data-v2.py (5.1KB)
  1. # ------------------------------------------------
  2. # Imports
  3. # ------------------------------------------------
  4. from matplotlib import pyplot as plt
  5. from datetime import datetime
  6. import os
  7. import json
  8. import re
  9. import sys, getopt
  10. # ------------------------------------------------
  11. # Globals
  12. # ------------------------------------------------
# Directory listed by computeData(); every entry found in it is read.
DATA_PATH = './data/'
# Turned on by the --month option; cleanMessages() then filters on MONTH.
MONTH_MODE = False
# Month selected with --month as [month, year] strings, None otherwise.
MONTH = None
# Label given to the merged slice built by mergeSmallParticipation().
OTHER_LABEL = 'Les Autres'
# JSON tags (dictionary keys read from the export files)
PARTICIPANTS = 'participants'
MESSAGES = 'messages'
NAME = 'name'
CONTENT = 'content'
TIMESTAMP = 'timestamp_ms'
SENDER = 'sender_name'
# Option summary printed by printHelp() and by the -h/--help option.
HELP = """General options :
-h, --help Consulter l'aide
--path=<path> Redéfinir le chemin d'accès aux données (par défaut ./data)
--month <mm/yyyy>
"""
  29. # ------------------------------------------------
  30. # Functions
  31. # ------------------------------------------------
  32. def printHelp():
  33. print('Usage:\n '+os.path.basename(__file__)+' <command> [option]\n')
  34. print(HELP)
  35. sys.exit(2)
  36. def handleArguments(argv):
  37. global MONTH_MODE
  38. global MONTH
  39. try:
  40. opts, args = getopt.getopt(argv, 'h',['help','path=', 'month='])
  41. except getopt.GetoptError:
  42. printHelp()
  43. for opt, arg in opts:
  44. if opt in ('-h', '--help'):
  45. print(HELP)
  46. sys.exit()
  47. elif opt in ('--path'):
  48. DATA_PATH = arg
  49. elif opt in ('--month'):
  50. MONTH_MODE = True
  51. try:
  52. t = arg.split("/")
  53. int(t[0])
  54. int(t[1])
  55. MONTH = t
  56. except:
  57. printHelp()
  58. def readBrokenFbJson(datafile_path):
  59. # ntm facebook
  60. # https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded
  61. with open(datafile_path, 'rb') as data_file:
  62. binary_data = data_file.read()
  63. replace_func = lambda m: bytes.fromhex(m.group(1).decode())
  64. pattern = rb'\\u00([\da-f]{2})'
  65. repaired = re.sub(pattern, replace_func, binary_data)
  66. return json.loads(repaired.decode('utf8'))
  67. def computeData():
  68. # Tous les fichiers du dossier sont traités sans distinction
  69. datafiles_path = [DATA_PATH + filename for filename in os.listdir(DATA_PATH)]
  70. messages, participants = [], []
  71. for datafile_path in datafiles_path:
  72. datacontent = readBrokenFbJson(datafile_path)
  73. if datacontent is None : continue
  74. participants += datacontent[PARTICIPANTS]
  75. messages += datacontent[MESSAGES]
  76. participants = cleanParticipants(participants)
  77. messages = cleanMessages(messages)
  78. return participants, messages
  79. def cleanParticipants(rawParticipants):
  80. return set([participant[NAME] for participant in rawParticipants])
  81. def cleanMessages(rawMessages):
  82. if MONTH_MODE:
  83. cleanMessages = [
  84. message for message in rawMessages if \
  85. CONTENT in message and \
  86. datetime.fromtimestamp(message[TIMESTAMP]/1000).month == int(MONTH[0]) and \
  87. datetime.fromtimestamp(message[TIMESTAMP]/1000).year == int(MONTH[1])
  88. ]
  89. else:
  90. cleanMessages = [message for message in rawMessages if CONTENT in message]
  91. return sorted(cleanMessages, key = lambda x: x[TIMESTAMP])
  92. # TODO tester l'approche en recherche incrémentale
  93. # Jeu de données du 14/10/2021. 33679 messages conservés
  94. # pour un compte final de 34120. Soit une perte estimée à 1.3%
  95. def filterMessages(messages):
  96. return [msg for msg in messages if re.search('(\d{2,}|^\d$)', msg[CONTENT])]
  97. def computeParticipation(messages):
  98. result = {}
  99. for message in messages:
  100. sender = message[SENDER]
  101. countParticipation(result, sender, message)
  102. return sorted(result.items(), key = lambda x: x[1])
  103. def countParticipation(participations, sender, message):
  104. participations[sender] = participations[sender] + 1 if sender in participations else 1
  105. def mergeSmallParticipation(rawParticipation, threshold = 1):
  106. values = [e[1] for e in rawParticipation]
  107. labels = [e[0] for e in rawParticipation]
  108. totalValues = sum(values)
  109. idx = 0
  110. for idx, value in enumerate(values):
  111. if 100 * value / totalValues >= threshold: break
  112. return [sum(values[0:idx])] + values[idx:], [OTHER_LABEL] + labels[idx:]
  113. def displayParticipation(participation):
  114. values, labels = mergeSmallParticipation(participation)
  115. plt.figure(figsize=(8,7), tight_layout = True)
  116. plt.pie(values,
  117. startangle = 90,
  118. counterclock = False,
  119. labels = labels,
  120. rotatelabels = True)
  121. plt.show()
  122. def consoleDisplay(participations):
  123. for participation in participations:
  124. print(participation)
  125. # ------------------------------------------------
  126. # Main Code
  127. # ------------------------------------------------
  128. def main(argv):
  129. handleArguments(argv)
  130. participants, messages = computeData()
  131. messages = filterMessages(messages)
  132. participation = computeParticipation(messages)
  133. consoleDisplay(participation)
  134. displayParticipation(participation)
  135. if __name__ == "__main__":
  136. main(sys.argv[1:])