Du kannst nicht mehr als 25 Themen auswählen. Themen müssen entweder mit einem Buchstaben oder mit einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

export_data-v2.py 4.1KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. # ------------------------------------------------
  2. # Imports
  3. # ------------------------------------------------
  4. from matplotlib import pyplot as plt
  5. import os
  6. import json
  7. import re
  8. import sys, getopt
  9. # ------------------------------------------------
  10. # Globals
  11. # ------------------------------------------------
  # Folder scanned for data files unless --path overrides it.
  DATA_PATH = './data/'
  # Label of the pie slice that groups the smallest contributors.
  OTHER_LABEL = 'Les Autres'

  # JSON tags
  PARTICIPANTS, MESSAGES = 'participants', 'messages'
  NAME, CONTENT = 'name', 'content'
  TIMESTAMP, SENDER = 'timestamp_ms', 'sender_name'

  # Help text printed for -h/--help and after a command-line parsing error.
  HELP = (
      "General options :\n"
      "-h, --help Consulter l'aide\n"
      "--path=<path> Redéfinir le chemin d'accès aux données (par défaut ./data)\n"
  )
  25. # ------------------------------------------------
  26. # Functions
  27. # ------------------------------------------------
  def handleArguments(argv):
      """Parse the command-line options and apply them.

      Recognised options:
          -h, --help      print HELP and exit
          --path=<path>   replace the module-level DATA_PATH

      Args:
          argv: argument list without the program name (e.g. sys.argv[1:]).

      Raises:
          SystemExit: with status 2 when getopt rejects the arguments, or
              with the default status after the help text was requested.
      """
      # Without this declaration the assignment below only creates a local
      # variable and --path silently has no effect.
      global DATA_PATH
      try:
          opts, _args = getopt.getopt(argv, 'h', ['help', 'path='])
      except getopt.GetoptError:
          print('Usage:\n '+os.path.basename(__file__)+' <command> [option]\n')
          print(HELP)
          sys.exit(2)
      for opt, arg in opts:
          if opt in ('-h', '--help'):
              print(HELP)
              sys.exit()
          elif opt == '--path':
              # Keep a trailing separator, like the default './data/', so that
              # code prefixing DATA_PATH to a file name still builds a valid
              # path when the user omits the final slash.
              DATA_PATH = os.path.join(arg, '')
  # Matches a ``\u00XX`` escape that stands for one raw byte >= 0x80.
  # Group 1 captures any preceding pairs of backslashes (escaped backslashes)
  # so that an escaped backslash followed by the text "u00XX" is left alone.
  _RAW_BYTE_ESCAPE = re.compile(
      rb'(?<!\\)((?:\\\\)*)\\u00([89a-fA-F][0-9a-fA-F])'
  )


  def readBrokenFbJson(datafile_path):
      r"""Load a JSON file whose non-ASCII text was escaped byte by byte.

      The file is read as bytes, every ``\u00XX`` escape in the range
      0x80-0xFF is turned back into the raw byte it denotes, and the result
      is decoded as UTF-8 before being parsed. Background:
      https://stackoverflow.com/questions/50008296/facebook-json-badly-encoded

      Escapes below 0x80 (for instance ``\u0022`` or ``\u000a``) are plain
      ASCII; they are left for the JSON parser, because inlining them as raw
      bytes would corrupt the document.

      Args:
          datafile_path: path of the JSON file to read.

      Returns:
          The object produced by ``json.loads`` for the repaired text.

      Raises:
          OSError: if the file cannot be opened.
          UnicodeDecodeError: if the repaired bytes are not valid UTF-8.
          json.JSONDecodeError: if the repaired text is not valid JSON.
      """
      with open(datafile_path, 'rb') as data_file:
          binary_data = data_file.read()

      def restore_byte(match):
          # Keep the escaped backslashes, replace only the \u00XX part.
          return match.group(1) + bytes.fromhex(match.group(2).decode('ascii'))

      repaired = _RAW_BYTE_ESCAPE.sub(restore_byte, binary_data)
      return json.loads(repaired.decode('utf8'))
  def computeData():
      """Load every file found in DATA_PATH and merge their contents.

      Returns:
          tuple: ``(participants, messages)`` where participants is the
          result of cleanParticipants and messages the result of
          cleanMessages, both applied to the entries gathered from all files.

      Raises:
          OSError: if DATA_PATH cannot be listed or a file cannot be read.
      """
      # Every regular file of the folder is processed without distinction.
      # Sub-directories are skipped because they cannot be opened as files,
      # and the listing is sorted so that runs are reproducible.
      candidates = (
          os.path.join(DATA_PATH, filename)
          for filename in sorted(os.listdir(DATA_PATH))
      )
      datafiles_path = [path for path in candidates if os.path.isfile(path)]
      messages, participants = [], []
      print(datafiles_path)
      for datafile_path in datafiles_path:
          datacontent = readBrokenFbJson(datafile_path)
          # Anything that is not a JSON object (including null) carries no
          # participants or messages.
          if not isinstance(datacontent, dict):
              continue
          participants += datacontent.get(PARTICIPANTS, [])
          messages += datacontent.get(MESSAGES, [])
      return cleanParticipants(participants), cleanMessages(messages)
  def cleanParticipants(rawParticipants):
      """Return the set of distinct NAME values found in the participant records."""
      names = set()
      for entry in rawParticipants:
          names.add(entry[NAME])
      return names
  def cleanMessages(rawMessages):
      """Keep the messages that have a CONTENT entry, ordered by ascending TIMESTAMP."""
      def by_timestamp(message):
          return message[TIMESTAMP]

      with_content = list(filter(lambda message: CONTENT in message, rawMessages))
      with_content.sort(key=by_timestamp)
      return with_content
  # TODO: try an incremental-search approach.
  # Data set of 14 October 2021: 33679 messages kept for a final count of
  # 34120, i.e. an estimated loss of 1.3%.

  # A run of at least two digits anywhere in the text, or a text that is a
  # single digit. Raw literal: '\d' in a plain string is an invalid escape.
  _NUMBER_PATTERN = re.compile(r'(\d{2,}|^\d$)')


  def filterMessages(messages):
      """Keep the messages whose CONTENT looks like it carries a number.

      Args:
          messages: iterable of message dicts that all have a CONTENT entry.

      Returns:
          list: the messages whose content contains two or more consecutive
          digits, or consists of exactly one digit, in their original order.
      """
      return [msg for msg in messages if _NUMBER_PATTERN.search(msg[CONTENT])]
  def computeParticipation(messages):
      """Count the messages per SENDER.

      Returns a list of (sender, count) pairs sorted by increasing count;
      senders with equal counts keep their order of first appearance.
      """
      counts = {}
      for entry in messages:
          author = entry[SENDER]
          counts[author] = counts.get(author, 0) + 1
      ranking = list(counts.items())
      ranking.sort(key=lambda pair: pair[1])
      return ranking
  def mergeSmallParticipation(rawParticipation, threshold = 1):
      """Group the smallest contributors into a single OTHER_LABEL entry.

      Args:
          rawParticipation: iterable of (label, value) pairs. It is sorted by
              increasing value here, so callers need not pre-sort it.
          threshold: minimum share of the total, in percent, an entry needs
              to keep its own slice.

      Returns:
          tuple: ``(values, labels)`` as two parallel lists. When at least
          one entry falls below the threshold, the first element is the sum
          of those entries labelled OTHER_LABEL, followed by the remaining
          entries in increasing order. When nothing is merged (including
          empty input or a zero total) the entries are returned as they are,
          without an empty OTHER_LABEL slice.
      """
      ranked = sorted(rawParticipation, key=lambda entry: entry[1])
      values = [entry[1] for entry in ranked]
      labels = [entry[0] for entry in ranked]
      totalValues = sum(values)
      if not totalValues:
          # Nothing to compare shares against; also avoids dividing by zero.
          return values, labels
      # Index of the first entry reaching the threshold. Falls back to
      # len(values) when no entry does, so that the last one is merged too.
      idx = next(
          (i for i, value in enumerate(values)
           if 100 * value / totalValues >= threshold),
          len(values),
      )
      if idx == 0:
          return values, labels
      return [sum(values[:idx])] + values[idx:], [OTHER_LABEL] + labels[idx:]
  def displayParticipation(participation):
      """Show the participation as a pie chart.

      The entries first go through mergeSmallParticipation, then the
      resulting values and labels are drawn with pyplot and displayed.
      """
      shares, names = mergeSmallParticipation(participation)
      pie_options = dict(
          startangle = 90,
          counterclock = False,
          labels = names,
          rotatelabels = True,
      )
      plt.figure(figsize=(8,7), tight_layout = True)
      plt.pie(shares, **pie_options)
      plt.show()
  95. # ------------------------------------------------
  96. # Main Code
  97. # ------------------------------------------------
  def main(argv):
      """Run the whole pipeline: parse options, load, filter, count, display."""
      handleArguments(argv)
      # Only the messages are used below; the participants set is not needed.
      _participants, loaded = computeData()
      kept = filterMessages(loaded)
      displayParticipation(computeParticipation(kept))


  if __name__ == "__main__":
      main(sys.argv[1:])