#!/bin/bash

# This file is part of Fedora Project Infrastructure Ansible
# Repository.
#
# Fedora Project Infrastructure Ansible Repository is free software:
# you can redistribute it and/or modify it under the terms of the GNU
# General Public License as published by the Free Software Foundation,
# either version 3 of the License, or (at your option) any later
# version.
#
# Fedora Project Infrastructure Ansible Repository is distributed in
# the hope that it will be useful, but WITHOUT ANY WARRANTY; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License
# along with Fedora Project Infrastructure Ansible Repository.  If
# not, see <http://www.gnu.org/licenses/>.

# There is a multi-day delay involved in processing the logs. It can
# take up to 4 days for the logs to reach the main server, and another
# day to combine them all onto combined-httpd, so we assume we are
# 5 days behind.
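# e.g. a run on 2024-06-10 would process the logs from 2024-06-05.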

NUMDAYS=5
OLDDAYS=$(( NUMDAYS + 1 ))

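# Project name and the web log this script condenses.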
PROJECT=hotspot
WEBLOG=fedoraproject.org

# This is the year/month/day for N days ago.
YEAR=$(/bin/date -d "-${NUMDAYS} days" +%Y)
MONTH=$(/bin/date -d "-${NUMDAYS} days" +%m)
DAY=$(/bin/date -d "-${NUMDAYS} days" +%d)

# And we have to deal with year/month/day boundaries for our later grep.
OLDDATE=$(/bin/date -d "-${OLDDAYS} days" +%Y-%m-%d)
OLDYEAR=$(/bin/date -d "-${OLDDAYS} days" +%Y)

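# Where the combined web logs are mounted and the day's access log within them.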
NFSDIR=/mnt/fedora_stats/combined-http
TARGET=${NFSDIR}/${YEAR}/${MONTH}/${DAY}

LOGFILE=${TARGET}/${WEBLOG}-access.log

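# Per-project work area holding the condensed per-day raw files.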
WORKDIR=/mnt/fedora_stats/data/${PROJECT}
WORKFILE=${WORKDIR}/${YEAR}/${MONTH}/raw-${DAY}

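# Where the generated CSV reports are published.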
WEBDIR=/var/www/html/csv-reports/${PROJECT}

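# Scratch directory, removed again at the end of the run.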
TEMPDIR=$( mktemp -d /tmp/web-data-analysis.XXXXXXXXX )

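# Locations of the locally installed helper scripts and their awk/gnuplot files.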
LBIN=/usr/local/bin/
LSHARE=/usr/local/share/web-data-analysis

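# Make sure this month's work directory and the per-month/per-year output files exist.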
mkdir -p ${WORKDIR}/${YEAR}/${MONTH}
if [[ ! -f ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH} ]]; then
    touch ${WORKDIR}/${YEAR}/out-${YEAR}-${MONTH}
fi

if [[ ! -f ${WORKDIR}/out-${YEAR} ]]; then
    touch ${WORKDIR}/out-${YEAR}
fi

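# Condense the day's access log into the raw per-day data file.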
if [[ ! -f ${LOGFILE} ]]; then
    echo "No logfile found for ${YEAR}/${MONTH}/${DAY}. Please fix."
else
    awk -f ${LSHARE}/${PROJECT}.awk ${LOGFILE} > ${WORKFILE}
fi

# The data doesn't split cleanly across month boundaries because the
# logfiles end at 04:00 rather than 23:59. Log files can also get
# stuck, leaving days or weeks of data in a single file. Because the
# data is pretty small, we can get away with re-adding it all every day.

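# Merge every raw- day file: de-duplicate the lines and sum the counts
# for each entry into a single worked-all file.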
find ${WORKDIR} -type f | grep raw- | xargs cat | sort -u |
    awk 'BEGIN{x=0; y=0}; {if (x != $1){ print x,y; x=$1; y=$2} else {y=y+$2}}' > ${WORKDIR}/worked-all


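# Convert the merged data into the published CSV, dropping the placeholder 1970-01-01 row.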
awk -f ${LSHARE}/${PROJECT}-data.awk ${WORKDIR}/worked-all | grep -v "1970-01-01,0,0,0" | sort -u > ${WEBDIR}/${PROJECT}data-all.csv

# Make the seven day moving average file
/usr/local/bin/hotspot-moving_avg.py > ${WEBDIR}/${PROJECT}data-all-7day-ma.csv

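# Regenerate the graphs with the project's gnuplot script.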
gnuplot ${LSHARE}/${PROJECT}.gp

# clean up the temp data
rm -rf ${TEMPDIR}