139 lines
5.5 KiB
Swift

import Foundation
import CocoaLumberjackSwift
/**
* Supplies the session ID that `SamplingController` uses as the sampling
* identifier for streams that are sampled by session.
*
* Constrained to `AnyObject` so that the controller can hold its delegate
* through a `weak` reference.
*/
protocol SamplingControllerDelegate: AnyObject {
/// Identifier of the current session. `SamplingController` parses its first
/// eight characters as a hexadecimal number when determining in- vs
/// out-of-sample.
var sessionID: String { get }
}
class SamplingController: NSObject {

    /**
     * Serial dispatch queue through which every access to `samplingCache` is
     * funneled, so that the cache can be read and written in a thread-safe way
     */
    private let queue = DispatchQueue(label: "EventPlatformClientSampling-" + UUID().uuidString)

    /**
     * Cache of "in sample" / "out of sample" determination for each stream
     *
     * The determination only has to be made the first time an event is logged
     * to a stream for which stream configuration is available. All other
     * times `inSample(stream:config:)` simply returns the cached determination.
     *
     * Only touch this property on `queue`: writes via `queue.async`, reads via
     * `queue.sync`
     */
    private var samplingCache: [EventPlatformClient.Stream: Bool] = [:]

    /**
     * Provider of the session ID used for streams sampled by session. Held
     * weakly; while it is `nil`, streams sampled by session fall back to
     * out-of-sample.
     */
    weak var delegate: SamplingControllerDelegate?

    /**
     * Compute a boolean function on a random identifier
     * - Parameter stream: name of the stream
     * - Parameter config: stream configuration for the provided stream name
     * - Returns: `true` if in sample or `false` otherwise
     *
     * The determinations are lazy and cached, so each stream's in-sample vs
     * out-of-sample determination is computed only once, the first time an event
     * is logged to that stream. Fallback determinations (unsupported identifier
     * type, missing identifier) are cached as well and stay in effect until
     * `removeAllSamplingCache()` is called.
     *
     * Refer to sampling settings section in
     * [mw:Wikimedia Product/Analytics Infrastructure/Stream configuration](https://www.mediawiki.org/wiki/Wikimedia_Product/Analytics_Infrastructure/Stream_configuration)
     * for more information.
     */
    func inSample(stream: EventPlatformClient.Stream, config: EventPlatformClient.StreamConfiguration) -> Bool {

        if let cachedValue = getSamplingForStream(stream) {
            return cachedValue
        }

        guard let rate = config.sampling?.rate else {
            /*
             * If stream is present in streamConfigurations but doesn't have
             * sampling settings, it is always in-sample.
             */
            cacheSamplingForStream(stream, inSample: true)
            return true
        }

        /*
         * All platforms use session ID as the default identifier for determining
         * in- vs out-of-sample of events sent to streams. On the web, streams can
         * be set to use pageview token instead. On the apps, streams can be set
         * to use device token instead.
         */
        let sessionIdentifierType = "session"
        let deviceIdentifierType = "device"
        let identifierType = config.sampling?.identifier ?? sessionIdentifierType
        let appInstallID = UserDefaults.standard.wmf_appInstallId

        // Any identifier type other than "session" or "device" is unsupported
        // here and is treated as out-of-sample.
        guard identifierType == sessionIdentifierType || identifierType == deviceIdentifierType else {
            DDLogDebug("EPC: Logged to stream which is not configured for sampling based on \(sessionIdentifierType) or \(deviceIdentifierType) identifier")
            cacheSamplingForStream(stream, inSample: false)
            return false
        }

        // Session-based streams use the delegate's session ID; device-based
        // streams use the app install ID.
        guard let identifier = identifierType == sessionIdentifierType ? delegate?.sessionID : appInstallID else {
            DDLogError("EPC: Missing token for determining in- vs out-of-sample. Falling back to out-of-sample.")
            cacheSamplingForStream(stream, inSample: false)
            return false
        }

        let result = determine(identifier, rate)
        cacheSamplingForStream(stream, inSample: result)
        return result
    }

    /**
     * Yields a deterministic (not stochastic) determination of whether the
     * provided `id` is in-sample or out-of-sample according to the `acceptance`
     * rate
     * - Parameter id: identifier to use for determining sampling; its first
     * eight characters are parsed as a hexadecimal number
     * - Parameter acceptance: the desired proportion of identifiers being
     * accepted, e.g. `0.1` to accept roughly 10% of them
     * - Returns: `true` if `id` is in-sample, `false` if it is out-of-sample or
     * if its prefix cannot be parsed as a hexadecimal number
     *
     * The algorithm works in a "widen the net on frozen fish" fashion -- tokens
     * continue evaluating to true as the acceptance rate increases. For example,
     * a device determined to be in-sample for a stream "A" having rate 0.1 will
     * be determined to be in-sample for a stream "B" having rate 0.2, and its
     * events will show up in tables "A" and "B".
     */
    private func determine(_ id: String, _ acceptance: Double) -> Bool {
        guard let token = UInt32(id.prefix(8), radix: 16) else {
            return false
        }

        /*
         * A rate of 1.0 (or more) accepts every parsable identifier. Without
         * this check the token `ffffffff` would map to exactly 1.0 and be
         * rejected by the strict `<` comparison below, even at a 100% rate.
         */
        if acceptance >= 1.0 {
            return true
        }

        return (Double(token) / Double(UInt32.max)) < acceptance
    }

    /**
     * Thread-safe asynchronous caching of a stream's in-vs-out-of-sample
     * determination
     * - Parameter stream: name of stream to cache determination for
     * - Parameter inSample: whether the stream was determined to be in-sample
     * this session
     */
    func cacheSamplingForStream(_ stream: EventPlatformClient.Stream, inSample: Bool) {
        queue.async {
            self.samplingCache[stream] = inSample
        }
    }

    /**
     * Thread-safe synchronous retrieval of a stream's cached in-vs-out-of-sample determination
     * - Parameter stream: name of stream to retrieve determination for from the cache
     * - Returns: `true` if stream was determined to be in-sample this session,
     * `false` if it was determined to be out-of-sample, or `nil` if no
     * determination has been cached for the stream
     */
    func getSamplingForStream(_ stream: EventPlatformClient.Stream) -> Bool? {
        queue.sync {
            return self.samplingCache[stream]
        }
    }

    /**
     * Thread-safe asynchronous clearance of cached stream in-vs-out-of-sample determinations
     */
    func removeAllSamplingCache() {
        queue.async {
            self.samplingCache.removeAll()
        }
    }
}